Injest a csv file as pure text, then as list of lines, and then as a data frame.


In [14]:
%pwd


Out[14]:
'/Users/alexsimonian/Desktop/NightThree'

In [7]:
import csv

datafile = open('/Users/alexsimonian/Desktop/NightThree/tmdb_5000_movies.csv', 'r')
myreader = csv.reader(datafile)

In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('./tmdb_5000_movies.csv', delimiter=",")

In [5]:
df1 = pd.DataFrame(df1)

In [6]:
df1.head(5)


Out[6]:
budget genres homepage id keywords original_language original_title overview popularity production_companies production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 237000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://www.avatarmovie.com/ 19995 [{"id": 1463, "name": "culture clash"}, {"id":... en Avatar In the 22nd century, a paraplegic Marine is di... 150.437577 [{"name": "Ingenious Film Partners", "id": 289... [{"iso_3166_1": "US", "name": "United States o... 2009-12-10 2787965087 162.0 [{"iso_639_1": "en", "name": "English"}, {"iso... Released Enter the World of Pandora. Avatar 7.2 11800
1 300000000 [{"id": 12, "name": "Adventure"}, {"id": 14, "... http://disney.go.com/disneypictures/pirates/ 285 [{"id": 270, "name": "ocean"}, {"id": 726, "na... en Pirates of the Caribbean: At World's End Captain Barbossa, long believed to be dead, ha... 139.082615 [{"name": "Walt Disney Pictures", "id": 2}, {"... [{"iso_3166_1": "US", "name": "United States o... 2007-05-19 961000000 169.0 [{"iso_639_1": "en", "name": "English"}] Released At the end of the world, the adventure begins. Pirates of the Caribbean: At World's End 6.9 4500
2 245000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://www.sonypictures.com/movies/spectre/ 206647 [{"id": 470, "name": "spy"}, {"id": 818, "name... en Spectre A cryptic message from Bond’s past sends him o... 107.376788 [{"name": "Columbia Pictures", "id": 5}, {"nam... [{"iso_3166_1": "GB", "name": "United Kingdom"... 2015-10-26 880674609 148.0 [{"iso_639_1": "fr", "name": "Fran\u00e7ais"},... Released A Plan No One Escapes Spectre 6.3 4466
3 250000000 [{"id": 28, "name": "Action"}, {"id": 80, "nam... http://www.thedarkknightrises.com/ 49026 [{"id": 849, "name": "dc comics"}, {"id": 853,... en The Dark Knight Rises Following the death of District Attorney Harve... 112.312950 [{"name": "Legendary Pictures", "id": 923}, {"... [{"iso_3166_1": "US", "name": "United States o... 2012-07-16 1084939099 165.0 [{"iso_639_1": "en", "name": "English"}] Released The Legend Ends The Dark Knight Rises 7.6 9106
4 260000000 [{"id": 28, "name": "Action"}, {"id": 12, "nam... http://movies.disney.com/john-carter 49529 [{"id": 818, "name": "based on novel"}, {"id":... en John Carter John Carter is a war-weary, former military ca... 43.926995 [{"name": "Walt Disney Pictures", "id": 2}] [{"iso_3166_1": "US", "name": "United States o... 2012-03-07 284139100 132.0 [{"iso_639_1": "en", "name": "English"}] Released Lost in our world, found in another. John Carter 6.1 2124

Goal Two


In [8]:
import json
import pandas as pd
import numpy as np

df = pd.read_csv("tmdb_5000_movies.csv")

#convert to json
json_columns = ['genres', 'keywords', 'production_countries',
                    'production_companies', 'spoken_languages']
for column in json_columns:
    df[column] = df[column].apply(json.loads)


def get_unique_inner_json(feature):
    tmp = []
    for i, row in df[feature].iteritems():
        for x in range(0,len(df[feature].iloc[i])):
            tmp.append(df[feature].iloc[i][x]['name'])

    unique_values = set(tmp)
    return unique_values

In [9]:
def widen_data(df, feature):
    unique_json = get_unique_inner_json(feature)
   
    tmp = []
    #rearrange genres
    for i, row in df.iterrows():
        for x in range(0,len(row[feature])):
            for val in unique_json:
                if row[feature][x]['name'] == val:
                    row[val] = 1
                    
        tmp.append(row)
    
    new_df = pd.DataFrame(tmp)
    new_df[list(unique_json)] = new_df[list(unique_json)].fillna(value=0)
    return new_df

In [10]:
genres_arranged_df = widen_data(df, "genres")
genres_arranged_df[list(get_unique_inner_json("genres"))] = genres_arranged_df[list(get_unique_inner_json("genres"))].astype(int)

In [11]:
genres_arranged_df.query('title == "Avatar"')


Out[11]:
Action Adventure Animation Comedy Crime Documentary Drama Family Fantasy Foreign ... production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
0 1 1 0 0 0 0 0 0 1 0 ... [{'iso_3166_1': 'US', 'name': 'United States o... 2009-12-10 2787965087 162.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Enter the World of Pandora. Avatar 7.2 11800

1 rows × 40 columns

Goal Three


In [13]:
genres_long_df = pd.melt(genres_arranged_df, id_vars=df.columns, 
                         value_vars=get_unique_inner_json("genres"), var_name="genre", value_name="genre_val")
genres_long_df = genres_long_df[genres_long_df['genre_val'] == 1]
genres_long_df.query('title == "Avatar"')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-13-2e605b4e02a0> in <module>()
      1 genres_long_df = pd.melt(genres_arranged_df, id_vars=df.columns, 
----> 2                          value_vars=get_unique_inner_json("genres"), var_name="genre", value_name="genre_val")
      3 genres_long_df = genres_long_df[genres_long_df['genre_val'] == 1]
      4 genres_long_df.query('title == "Avatar"')

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/core/reshape.py in melt(frame, id_vars, value_vars, var_name, value_name, col_level)
    766         if not isinstance(value_vars, (tuple, list, np.ndarray)):
    767             value_vars = [value_vars]
--> 768         frame = frame.ix[:, id_vars + value_vars]
    769     else:
    770         frame = frame.copy()

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
     82                 pass
     83 
---> 84             return self._getitem_tuple(key)
     85         else:
     86             key = com._apply_if_callable(key, self.obj)

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
    814                 continue
    815 
--> 816             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
    817 
    818         return retval

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1012                 raise ValueError('Cannot index with multidimensional key')
   1013 
-> 1014             return self._getitem_iterable(key, axis=axis)
   1015         else:
   1016 

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1067 
   1068             # existing labels are unique and indexer are unique
-> 1069             if labels.is_unique and Index(keyarr).is_unique:
   1070 
   1071                 try:

pandas/src/properties.pyx in pandas.lib.cache_readonly.__get__ (pandas/lib.c:45588)()

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/indexes/base.py in is_unique(self)
   1098     def is_unique(self):
   1099         """ return if the index has unique values """
-> 1100         return self._engine.is_unique
   1101 
   1102     @property

pandas/index.pyx in pandas.index.IndexEngine.is_unique.__get__ (pandas/index.c:5176)()

pandas/index.pyx in pandas.index.IndexEngine._do_unique_check (pandas/index.c:5243)()

pandas/index.pyx in pandas.index.IndexEngine._ensure_mapping_populated (pandas/index.c:6150)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.map_locations (pandas/hashtable.c:14389)()

/Users/alexsimonian/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/indexes/base.py in __hash__(self)
   1399 
   1400     def __hash__(self):
-> 1401         raise TypeError("unhashable type: %r" % type(self).__name__)
   1402 
   1403     def __setitem__(self, key, value):

TypeError: unhashable type: 'Index'

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: