Goal One

Injest a csv file as pure text... (just 500 chars)



In [18]:

    
with open('tmdb_5000_movies.csv','r') as f:
    rtext=''
    for line in f:
       rtext += line
rtext[:500]









    Out[18]:





'budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count\n237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 878, ""name"": ""Science Fiction""}]",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""'

Then as a list of lines... (just one line)



In [19]:

    
with open('tmdb_5000_movies.csv','r') as f:
    lines = [line for line in f]
lines[0]









    Out[19]:





'budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count\n'

Then as a data frame... (just Avatar)



In [20]:

    
import pandas as pd
df = pd.read_csv("tmdb_5000_movies.csv")
df.query('id == 19995')









    Out[20]:







  
    
      
      budget
      genres
      homepage
      id
      keywords
      original_language
      original_title
      overview
      popularity
      production_companies
      production_countries
      release_date
      revenue
      runtime
      spoken_languages
      status
      tagline
      title
      vote_average
      vote_count
    
  
  
    
      0
      237000000
      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
      http://www.avatarmovie.com/
      19995
      [{"id": 1463, "name": "culture clash"}, {"id":...
      en
      Avatar
      In the 22nd century, a paraplegic Marine is di...
      150.437577
      [{"name": "Ingenious Film Partners", "id": 289...
      [{"iso_3166_1": "US", "name": "United States o...
      2009-12-10
      2787965087
      162.0
      [{"iso_639_1": "en", "name": "English"}, {"iso...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800

Goal Two

Right now, the file is in a 'narrow' format. In other words, several interesting bits are collapsed into a single field. Let's attempt to make the data frame a 'wide' format. All the collapsed items expanded horizontally.

References:

https://www.kaggle.com/fabiendaniel/film-recommendation-engine

http://www.jeannicholashould.com/tidy-data-in-python.html



In [24]:

    
import json
import pandas as pd
import numpy as np

df = pd.read_csv("tmdb_5000_movies.csv")

#convert to json
json_columns = ['genres', 'keywords', 'production_countries',
                    'production_companies', 'spoken_languages']
for column in json_columns:
    df[column] = df[column].apply(json.loads)


def get_unique_inner_json(feature):
    tmp = []
    for i, row in df[feature].iteritems():
        for x in range(0,len(df[feature].iloc[i])):
            tmp.append(df[feature].iloc[i][x]['name'])

    unique_values = set(tmp)
    return unique_values



In [25]:

    
def widen_data(df, feature):
    unique_json = get_unique_inner_json(feature)
   
    tmp = []
    #rearrange genres
    for i, row in df.iterrows():
        for x in range(0,len(row[feature])):
            for val in unique_json:
                if row[feature][x]['name'] == val:
                    row[val] = 1
                    
        tmp.append(row)
    
    new_df = pd.DataFrame(tmp)
    new_df[list(unique_json)] = new_df[list(unique_json)].fillna(value=0)
    return new_df



In [26]:

    
genres_arranged_df = widen_data(df, "genres")
genres_arranged_df[list(get_unique_inner_json("genres"))] = genres_arranged_df[list(get_unique_inner_json("genres"))].astype(int)



In [27]:

    
genres_arranged_df.query('title == "Avatar"')









    Out[27]:







  
    
      
      Action
      Adventure
      Animation
      Comedy
      Crime
      Documentary
      Drama
      Family
      Fantasy
      Foreign
      ...
      production_countries
      release_date
      revenue
      runtime
      spoken_languages
      status
      tagline
      title
      vote_average
      vote_count
    
  
  
    
      0
      1
      1
      0
      0
      0
      0
      0
      0
      1
      0
      ...
      [{u'iso_3166_1': u'US', u'name': u'United Stat...
      2009-12-10
      2787965087
      162.0
      [{u'iso_639_1': u'en', u'name': u'English'}, {...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800
    
  

1 rows × 40 columns

Goal Three



In [29]:

    
genres_long_df = pd.melt(genres_arranged_df, id_vars=df.columns, value_vars=get_unique_inner_json("genres"), var_name="genre", value_name="genre_val")
genres_long_df = genres_long_df[genres_long_df['genre_val'] == 1]
genres_long_df.query('title == "Avatar"')









    Out[29]:







  
    
      
      budget
      genres
      homepage
      id
      keywords
      original_language
      original_title
      overview
      popularity
      production_companies
      ...
      revenue
      runtime
      spoken_languages
      status
      tagline
      title
      vote_average
      vote_count
      genre
      genre_val
    
  
  
    
      19212
      237000000
      [{u'id': 28, u'name': u'Action'}, {u'id': 12, ...
      http://www.avatarmovie.com/
      19995
      [{u'id': 1463, u'name': u'culture clash'}, {u'...
      en
      Avatar
      In the 22nd century, a paraplegic Marine is di...
      150.437577
      [{u'name': u'Ingenious Film Partners', u'id': ...
      ...
      2787965087
      162.0
      [{u'iso_639_1': u'en', u'name': u'English'}, {...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800
      Science Fiction
      1
    
    
      38424
      237000000
      [{u'id': 28, u'name': u'Action'}, {u'id': 12, ...
      http://www.avatarmovie.com/
      19995
      [{u'id': 1463, u'name': u'culture clash'}, {u'...
      en
      Avatar
      In the 22nd century, a paraplegic Marine is di...
      150.437577
      [{u'name': u'Ingenious Film Partners', u'id': ...
      ...
      2787965087
      162.0
      [{u'iso_639_1': u'en', u'name': u'English'}, {...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800
      Fantasy
      1
    
    
      52833
      237000000
      [{u'id': 28, u'name': u'Action'}, {u'id': 12, ...
      http://www.avatarmovie.com/
      19995
      [{u'id': 1463, u'name': u'culture clash'}, {u'...
      en
      Avatar
      In the 22nd century, a paraplegic Marine is di...
      150.437577
      [{u'name': u'Ingenious Film Partners', u'id': ...
      ...
      2787965087
      162.0
      [{u'iso_639_1': u'en', u'name': u'English'}, {...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800
      Adventure
      1
    
    
      62439
      237000000
      [{u'id': 28, u'name': u'Action'}, {u'id': 12, ...
      http://www.avatarmovie.com/
      19995
      [{u'id': 1463, u'name': u'culture clash'}, {u'...
      en
      Avatar
      In the 22nd century, a paraplegic Marine is di...
      150.437577
      [{u'name': u'Ingenious Film Partners', u'id': ...
      ...
      2787965087
      162.0
      [{u'iso_639_1': u'en', u'name': u'English'}, {...
      Released
      Enter the World of Pandora.
      Avatar
      7.2
      11800
      Action
      1
    
  

4 rows × 22 columns

	budget	genres	homepage	id	keywords	original_language	original_title	overview	popularity	production_companies	...	revenue	runtime	spoken_languages	status	tagline	title	vote_average	vote_count	genre	genre_val
19212	237000000	[{u'id': 28, u'name': u'Action'}, {u'id': 12, ...	http://www.avatarmovie.com/	19995	[{u'id': 1463, u'name': u'culture clash'}, {u'...	en	Avatar	In the 22nd century, a paraplegic Marine is di...	150.437577	[{u'name': u'Ingenious Film Partners', u'id': ...	...	2787965087	162.0	[{u'iso_639_1': u'en', u'name': u'English'}, {...	Released	Enter the World of Pandora.	Avatar	7.2	11800	Science Fiction	1
38424	237000000	[{u'id': 28, u'name': u'Action'}, {u'id': 12, ...	http://www.avatarmovie.com/	19995	[{u'id': 1463, u'name': u'culture clash'}, {u'...	en	Avatar	In the 22nd century, a paraplegic Marine is di...	150.437577	[{u'name': u'Ingenious Film Partners', u'id': ...	...	2787965087	162.0	[{u'iso_639_1': u'en', u'name': u'English'}, {...	Released	Enter the World of Pandora.	Avatar	7.2	11800	Fantasy	1
52833	237000000	[{u'id': 28, u'name': u'Action'}, {u'id': 12, ...	http://www.avatarmovie.com/	19995	[{u'id': 1463, u'name': u'culture clash'}, {u'...	en	Avatar	In the 22nd century, a paraplegic Marine is di...	150.437577	[{u'name': u'Ingenious Film Partners', u'id': ...	...	2787965087	162.0	[{u'iso_639_1': u'en', u'name': u'English'}, {...	Released	Enter the World of Pandora.	Avatar	7.2	11800	Adventure	1
62439	237000000	[{u'id': 28, u'name': u'Action'}, {u'id': 12, ...	http://www.avatarmovie.com/	19995	[{u'id': 1463, u'name': u'culture clash'}, {u'...	en	Avatar	In the 22nd century, a paraplegic Marine is di...	150.437577	[{u'name': u'Ingenious Film Partners', u'id': ...	...	2787965087	162.0	[{u'iso_639_1': u'en', u'name': u'English'}, {...	Released	Enter the World of Pandora.	Avatar	7.2	11800	Action	1