In [9]:
# Read the whole CSV in as one raw text string.
filename = "tmdb_5000_movies.csv"
# The context manager closes the file even if reading raises, and a single
# read() avoids rebuilding the string once per line. The encoding is explicit
# so the result does not depend on the platform default.
with open(filename, "r", encoding="utf-8") as file:
    raw_text = file.read()
# Sanity check that the header made it in: raw_text.count('id') should be > 0.
# Keep the preview as the last expression so it is what the cell displays.
raw_text[:1000]
Out[9]:
In [10]:
# Then as a list of lines (each element keeps its trailing newline).
filename = "tmdb_5000_movies.csv"
# The context manager closes the file for us, even if reading raises.
with open(filename, "r", encoding="utf-8") as file:
    lines = file.readlines()
In [1]:
# Finally, let pandas parse the CSV into a DataFrame and preview the first rows.
import pandas as pd
df = pd.read_csv(filepath_or_buffer="tmdb_5000_movies.csv")
df.head(n=5)
Out[1]:
In [15]:
# references
# https://www.kaggle.com/fabiendaniel/film-recommendation-engine
# http://www.jeannicholashould.com/tidy-data-in-python.html
In [64]:
import json
import pandas as pd
import numpy as np
# Columns whose cells hold JSON text (decoded below with json.loads).
json_columns = ['genres', 'keywords', 'production_countries',
                'production_companies', 'spoken_languages']
df = pd.read_csv("tmdb_5000_movies.csv")
# Decode each JSON string into the Python object it describes.
for column in json_columns:
    df[column] = df[column].map(json.loads)
def get_unique_inner_json(feature, data=None):
    """Collect the distinct ``'name'`` values stored in a decoded JSON column.

    Every cell of the column is iterated and each element's ``'name'`` entry
    is gathered, so cells are expected to be iterables of dict-like entries.

    Parameters
    ----------
    feature : str
        Label of the column to scan.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    set
        The unique ``'name'`` values found across all rows.

    Raises
    ------
    KeyError
        If ``feature`` is not a column, or an entry has no ``'name'`` key.
    TypeError
        If a cell is not iterable (for example a missing value).
    """
    frame = df if data is None else data
    # Iterate the cell values directly: this avoids Series.iteritems()
    # (removed in pandas 2.0) and never feeds index labels to .iloc, so the
    # result is correct for any index, not only the default 0..n-1 one.
    return {entry['name'] for entries in frame[feature] for entry in entries}
In [35]:
def widen_data(df, feature):
    """Return ``df`` plus one 0/1 indicator column per inner ``'name'`` value.

    Each cell of ``df[feature]`` is iterated and each element's ``'name'``
    entry is read. For every distinct name a new column is appended that
    holds 1.0 on rows whose cell contains that name and 0.0 elsewhere. The
    names are taken from the ``df`` argument itself, so the function works
    on any frame, not only the notebook-level one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to widen. It is not modified.
    feature : str
        Label of the column holding the lists of entries.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns first, followed by the
        indicator columns (float, sorted by name), on the same index.

    Raises
    ------
    ValueError
        If an inner name equals an existing column label; adding it would
        otherwise silently shadow the original column's data.
    """
    # One set of names per row gives O(1) membership tests below.
    per_row_names = [
        {entry['name'] for entry in entries} for entries in df[feature]
    ]
    # Sorted so the added columns come out in the same order on every run.
    names = sorted(set().union(*per_row_names))

    clashes = [name for name in names if name in df.columns]
    if clashes:
        raise ValueError(
            "indicator names collide with existing columns: %r" % (clashes,)
        )

    # Build all indicator columns in one go instead of mutating each row.
    indicators = pd.DataFrame(
        {
            name: [1.0 if name in row_names else 0.0
                   for row_names in per_row_names]
            for name in names
        },
        index=df.index,
    )
    return pd.concat([df, indicators], axis=1)
In [36]:
# Widen the genres column into indicator columns, then cast those
# indicators from float to int in the same step.
genres_arranged_df = widen_data(df, "genres").astype(
    {genre: int for genre in get_unique_inner_json("genres")}
)
In [48]:
# Preview the first rows of the widened frame.
genres_arranged_df.head(n=5)
Out[48]:
In [60]:
# Melt the genre indicator columns into long form: one row per
# (movie, genre) pair, with the indicator value in "genre_val".
# value_vars is a sorted list rather than the raw set: a set of strings has
# no stable order across kernel restarts, so the row order of the result
# would otherwise change from run to run.
genres_long_df = pd.melt(
    genres_arranged_df,
    id_vars=list(df.columns),
    value_vars=sorted(get_unique_inner_json("genres")),
    var_name="genre",
    value_name="genre_val",
)
In [62]:
# Keep only the (movie, genre) rows whose indicator equals 1.
genres_long_df = genres_long_df.loc[genres_long_df['genre_val'].eq(1)]
In [63]:
# Spot-check: show every remaining genre row for the movie with id 19995.
genres_long_df.loc[genres_long_df['id'].eq(19995)]
Out[63]:
In [ ]: