Injest a csv file as pure text... (just 500 chars)
In [18]:
with open('tmdb_5000_movies.csv','r') as f:
rtext=''
for line in f:
rtext += line
rtext[:500]
Out[18]:
Then as a list of lines... (just one line)
In [19]:
with open('tmdb_5000_movies.csv','r') as f:
lines = [line for line in f]
lines[0]
Out[19]:
Then as a data frame... (just Avatar)
In [20]:
import pandas as pd
df = pd.read_csv("tmdb_5000_movies.csv")
df.query('id == 19995')
Out[20]:
Right now, the file is in a 'narrow' format. In other words, several interesting bits are collapsed into a single field. Let's attempt to make the data frame a 'wide' format. All the collapsed items expanded horizontally.
In [24]:
import json
import pandas as pd
import numpy as np
df = pd.read_csv("tmdb_5000_movies.csv")
#convert to json
json_columns = ['genres', 'keywords', 'production_countries',
'production_companies', 'spoken_languages']
for column in json_columns:
df[column] = df[column].apply(json.loads)
def get_unique_inner_json(feature):
tmp = []
for i, row in df[feature].iteritems():
for x in range(0,len(df[feature].iloc[i])):
tmp.append(df[feature].iloc[i][x]['name'])
unique_values = set(tmp)
return unique_values
In [25]:
def widen_data(df, feature):
unique_json = get_unique_inner_json(feature)
tmp = []
#rearrange genres
for i, row in df.iterrows():
for x in range(0,len(row[feature])):
for val in unique_json:
if row[feature][x]['name'] == val:
row[val] = 1
tmp.append(row)
new_df = pd.DataFrame(tmp)
new_df[list(unique_json)] = new_df[list(unique_json)].fillna(value=0)
return new_df
In [26]:
genres_arranged_df = widen_data(df, "genres")
genres_arranged_df[list(get_unique_inner_json("genres"))] = genres_arranged_df[list(get_unique_inner_json("genres"))].astype(int)
In [27]:
genres_arranged_df.query('title == "Avatar"')
Out[27]:
In [29]:
genres_long_df = pd.melt(genres_arranged_df, id_vars=df.columns, value_vars=get_unique_inner_json("genres"), var_name="genre", value_name="genre_val")
genres_long_df = genres_long_df[genres_long_df['genre_val'] == 1]
genres_long_df.query('title == "Avatar"')
Out[29]: