Goal One


In [9]:
#Read file in as raw text
filename = "tmdb_5000_movies.csv"
file = open(filename, "r")
raw_text = ""
for line in file:
    raw_text += line
file.close()
raw_text[:1000]

""" 
could have used a with open that will close the file for you.
'prove it was there': file.count('id')
"""


Out[9]:
'budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count\n237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 878, ""name"": ""Science Fiction""}]",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"":'

In [10]:
#Then as a list of lines
filename = "tmdb_5000_movies.csv"
file = open(filename, "r")
lines = [line for line in file]
file.close()

""" faster: with open (filname, mode='r') as f: 
        [print(line) for line in f]
"""

In [1]:
#Then as a Data Frame
import pandas as pd
df = pd.read_csv("tmdb_5000_movies.csv")
df.head()


Out[1]:
budget genres homepage id keywords original_language original_title overview popularity production_companies production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 237000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://www.avatarmovie.com/ 19995 [{"id": 1463, "name": "culture clash"}, {"id":... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{"name": "Ingenious Film Partners", "id": 289... [{"iso_3166_1": "US", "name": "United States o... 2009-12-10 2787965087 162.0 [{"iso_639_1": "en", "name": "English"}, {"iso... Released Enter the World of Pandora. Avatar 7.2 11800
1 300000000 [{"id": 12, "name": "Adventure"}, {"id": 14, "... http://disney.go.com/disneypictures/pirates/ 285 [{"id": 270, "name": "ocean"}, {"id": 726, "na... en Pirates of the Caribbean: At World's End Captain Barbossa, long believed to be dead, ha... 139.082615 [{"name": "Walt Disney Pictures", "id": 2}, {"... [{"iso_3166_1": "US", "name": "United States o... 2007-05-19 961000000 169.0 [{"iso_639_1": "en", "name": "English"}] Released At the end of the world, the adventure begins. Pirates of the Caribbean: At World's End 6.9 4500
2 245000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://www.sonypictures.com/movies/spectre/ 206647 [{"id": 470, "name": "spy"}, {"id": 818, "name... en Spectre A cryptic message from Bond’s past sends him o... 107.376788 [{"name": "Columbia Pictures", "id": 5}, {"nam... [{"iso_3166_1": "GB", "name": "United Kingdom"... 2015-10-26 880674609 148.0 [{"iso_639_1": "fr", "name": "Fran\u00e7ais"},... Released A Plan No One Escapes Spectre 6.3 4466
3 250000000 [{"id": 28, "name": "Action"}, {"id": 80, "nam... http://www.thedarkknightrises.com/ 49026 [{"id": 849, "name": "dc comics"}, {"id": 853,... en The Dark Knight Rises Following the death of District Attorney Harve... 112.312950 [{"name": "Legendary Pictures", "id": 923}, {"... [{"iso_3166_1": "US", "name": "United States o... 2012-07-16 1084939099 165.0 [{"iso_639_1": "en", "name": "English"}] Released The Legend Ends The Dark Knight Rises 7.6 9106
4 260000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://movies.disney.com/john-carter 49529 [{"id": 818, "name": "based on novel"}, {"id":... en John Carter John Carter is a war-weary, former military ca... 43.926995 [{"name": "Walt Disney Pictures", "id": 2}] [{"iso_3166_1": "US", "name": "United States o... 2012-03-07 284139100 132.0 [{"iso_639_1": "en", "name": "English"}] Released Lost in our world, found in another. John Carter 6.1 2124

Goal Two


In [15]:
# references
# https://www.kaggle.com/fabiendaniel/film-recommendation-engine
# http://www.jeannicholashould.com/tidy-data-in-python.html

In [64]:
import json
import pandas as pd
import numpy as np

df = pd.read_csv("tmdb_5000_movies.csv")

#convert to json
json_columns = ['genres', 'keywords', 'production_countries',
                    'production_companies', 'spoken_languages']
for column in json_columns:
    df[column] = df[column].apply(json.loads)


def get_unique_inner_json(feature):
    tmp = []
    for i, row in df[feature].iteritems():
        for x in range(0,len(df[feature].iloc[i])):
            tmp.append(df[feature].iloc[i][x]['name'])

    unique_values = set(tmp)
    return unique_values

In [35]:
def widen_data(df, feature):
    unique_json = get_unique_inner_json(feature)
   
    tmp = []
    #rearrange genres
    for i, row in df.iterrows():
        for x in range(0,len(row[feature])):
            for val in unique_json:
                if row[feature][x]['name'] == val:
                    row[val] = 1
                    
        tmp.append(row)
    
    new_df = pd.DataFrame(tmp)
    new_df[list(unique_json)] = new_df[list(unique_json)].fillna(value=0)
    return new_df

In [36]:
genres_arranged_df = widen_data(df, "genres")

#Convert float values to ints
genres_arranged_df[list(get_unique_inner_json("genres"))] = genres_arranged_df[list(get_unique_inner_json("genres"))].astype(int)

In [48]:
genres_arranged_df.head()


Out[48]:
Action Adventure Animation Comedy Crime Documentary Drama Family Fantasy Foreign ... production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 1 1 0 0 0 0 0 0 1 0 ... [{'iso_3166_1': 'US', 'name': 'United States o... 2009-12-10 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800
1 1 1 0 0 0 0 0 0 1 0 ... [{'iso_3166_1': 'US', 'name': 'United States o... 2007-05-19 961000000 169.0 [{'iso_639_1': 'en', 'name': 'English'}] Released At the end of the world, the adventure begins. Pirates of the Caribbean: At World's End 6.9 4500
2 1 1 0 0 1 0 0 0 0 0 ... [{'iso_3166_1': 'GB', 'name': 'United Kingdom'... 2015-10-26 880674609 148.0 [{'iso_639_1': 'fr', 'name': 'Français'}, {'is... Released A Plan No One Escapes Spectre 6.3 4466
3 1 0 0 0 1 0 1 0 0 0 ... [{'iso_3166_1': 'US', 'name': 'United States o... 2012-07-16 1084939099 165.0 [{'iso_639_1': 'en', 'name': 'English'}] Released The Legend Ends The Dark Knight Rises 7.6 9106
4 1 1 0 0 0 0 0 0 0 0 ... [{'iso_3166_1': 'US', 'name': 'United States o... 2012-03-07 284139100 132.0 [{'iso_639_1': 'en', 'name': 'English'}] Released Lost in our world, found in another. John Carter 6.1 2124

5 rows × 40 columns

Goal Three


In [60]:
genres_long_df = pd.melt(genres_arranged_df, id_vars=df.columns, value_vars=get_unique_inner_json("genres"), var_name="genre", value_name="genre_val")

In [62]:
genres_long_df = genres_long_df[genres_long_df['genre_val'] == 1]

In [63]:
genres_long_df[genres_long_df['id'] == 19995]


Out[63]:
budget genres homepage id keywords original_language original_title overview popularity production_companies ... revenue runtime spoken_languages status tagline title vote_average vote_count genre genre_val
0 237000000 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... http://www.avatarmovie.com/ 19995 [{'id': 1463, 'name': 'culture clash'}, {'id':... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{'name': 'Ingenious Film Partners', 'id': 289... ... 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800 Adventure 1
4803 237000000 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... http://www.avatarmovie.com/ 19995 [{'id': 1463, 'name': 'culture clash'}, {'id':... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{'name': 'Ingenious Film Partners', 'id': 289... ... 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800 Science Fiction 1
14409 237000000 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... http://www.avatarmovie.com/ 19995 [{'id': 1463, 'name': 'culture clash'}, {'id':... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{'name': 'Ingenious Film Partners', 'id': 289... ... 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800 Action 1
19212 237000000 [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam... http://www.avatarmovie.com/ 19995 [{'id': 1463, 'name': 'culture clash'}, {'id':... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{'name': 'Ingenious Film Partners', 'id': 289... ... 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800 Fantasy 1

4 rows × 22 columns


In [ ]: