In [2]:
## Gnani Beldhari
## Goal One
## Ingest a csv file as pure text, then as a list of lines, and then as a data frame.
In [1]:
import csv
import numpy as np
import pandas as pd
import json
In [2]:
# Text - Read the whole CSV file into one string.
# f.read() pulls the file in a single call; the original appended line-by-line
# with '+=', which rebuilds the string on every iteration (quadratic time).
with open('tmdb_5000_movies.csv', mode='r', encoding='utf8') as f:
    movielistString = f.read()
print(movielistString[:1000])
In [9]:
# List - Read the CSV file line by line, dropping each line's newline
# character before storing it in the list.
with open('tmdb_5000_movies.csv', mode='r', encoding='utf8') as f:
    movieList = [line.replace('\n', '') for line in f]
# Show the second entry (first data row after the header).
movieList[1]
Out[9]:
In [38]:
# DataFrame - Use pandas' read_csv to parse the CSV file directly into a
# DataFrame; pandas handles the quoting/embedded commas in the JSON-valued
# columns that a naive line split would break on.
df_movielist = pd.read_csv('tmdb_5000_movies.csv')
# Peek at the first two rows to confirm the load.
df_movielist.head(2)
Out[38]:
In [13]:
## Goal Two
## Right now, the file is in a 'narrow' format; in other words, several interesting bits are collapsed into a single field. Let's attempt to make the data frame a 'wide' format, with all the collapsed items expanded horizontally.
In [30]:
# Take a working copy of just the title and genres columns.
df_genres = df_movielist.loc[:, ['title', 'genres']].copy()
print(df_genres.shape)
df_genres.head()
Out[30]:
In [31]:
# Parse the JSON-formatted genres strings and keep only the genre names.
# json.loads is the direct tool here; applying pd.read_json to literal JSON
# strings is deprecated in modern pandas and needlessly builds a DataFrame
# per cell. An empty genre list ('[]') becomes an empty Python list, which
# iterates the same way downstream as the original '' sentinel did.
df_genres['genres'] = df_genres['genres'].apply(
    lambda s: [g['name'] for g in json.loads(s)]
)
df_genres.head()
Out[31]:
In [32]:
# Define a helper to get the sorted unique values appearing across a column
# whose cells are iterables (e.g. lists of genre names).
def get_unique(df, col_name):
    """Return the sorted unique items found across all cells of df[col_name].

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Name of a column whose cells are iterables of hashable items.

    Returns
    -------
    numpy.ndarray
        Sorted unique items (same contract as the original np.unique call).
    """
    # Flatten every per-row iterable into one list, then dedupe + sort.
    # (The original used nested list comprehensions purely for their side
    # effects, which obscures the intent.)
    flattened = [item for cell in df[col_name] for item in cell]
    return np.unique(flattened)
In [33]:
# Get the unique genre names across all rows; these become the one-hot
# column labels for the wide-format frame built next.
unique_cols = get_unique(df_genres,'genres')
# Display the resulting array.
unique_cols
Out[33]:
In [34]:
# Build a one-hot frame with one column per unique genre: put a 1 in each
# (row, genre) cell where the movie lists that genre; other cells stay NaN.
# DataFrame.set_value was removed in pandas 1.0 and Series.iteritems in
# pandas 2.0; .at and .items are the supported replacements. A plain loop
# also reads better than nested comprehensions used only for side effects.
df_genres_temp = pd.DataFrame(columns=unique_cols)
for key, genre_list in df_genres['genres'].items():
    for genre in genre_list:
        df_genres_temp.at[key, genre] = 1
print(df_genres_temp.shape)
df_genres_temp.head()
Out[34]:
In [36]:
# Join the one-hot genre columns onto the original frame, aligned on
# df_genres' row index. The join_axes argument was removed from pd.concat
# in pandas 1.0; concatenating and then reindexing on df_genres.index
# reproduces the old behaviour.
df_genres_wide = pd.concat([df_genres, df_genres_temp], axis=1).reindex(df_genres.index)
print(df_genres_wide.shape)
df_genres_wide.head()
Out[36]:
In [ ]:
# # Wide data sets are good for exploration, but 'long' data sets are good for training.
# Let's attempt to expand all the collapsed field vertically instead of horizontally.
# Does this result in data duplication? What do you think about that? Yes and No are both correct -- but what's the context?
# Long data sets may result in data duplication and repetition, but they provide a simple and standardized way
# of structuring a dataset, which makes it easier for an analyst (or a computer) to extract the needed variables.
# Hadley Wickham described tidy data as having the following characteristics:
# 1. Each variable forms a column.
# 2. Each observation forms a row.
# 3. Each type of observational unit forms a table.
# The IMDB data set is untidy, with multiple values and observations packed into a single column (genres, production companies, etc.).
# We can treat each of these columns as an observational unit that can be separated into its own table for observation and analysis.
In [37]:
# Reshape to long format with melt: every genre column becomes a
# (genre_category, is_type) pair of rows per movie.
df_long_genres = df_genres_wide.melt(
    id_vars=['title', 'genres'],
    var_name='genre_category',
    value_name='is_type',
)
# Missing one-hot cells were NaN in the wide frame; treat them as 0.
df_long_genres.fillna(0, inplace=True)
df_long_genres = df_long_genres.sort_values(by=['title'], ascending=False)
df_long_genres.head(30)
Out[37]:
In [39]:
# Take a working copy of just the title and production_companies columns.
df_pcompany = df_movielist[['title', 'production_companies']].copy()
print(df_pcompany.shape)
# Parse the JSON-formatted strings with json.loads and keep only the company
# names. (Applying pd.read_json to literal JSON strings is deprecated in
# modern pandas; an empty list ('[]') simply yields an empty Python list.)
df_pcompany['production_companies'] = df_pcompany['production_companies'].apply(
    lambda s: [c['name'] for c in json.loads(s)]
)
df_pcompany.head()
Out[39]:
In [44]:
# Get the unique production-company names; these become the one-hot columns.
unique_cols = get_unique(df_pcompany, 'production_companies')
unique_cols
# Build the one-hot frame: 1 where a movie lists that company, NaN otherwise.
# (DataFrame.set_value and Series.iteritems were removed in modern pandas;
# .at and .items are the supported replacements.)
df_pcompany_temp = pd.DataFrame(columns=unique_cols)
for key, company_list in df_pcompany['production_companies'].items():
    for company in company_list:
        df_pcompany_temp.at[key, company] = 1
print(df_pcompany_temp.shape)
# Join the one-hot columns onto the original frame. join_axes was removed
# from pd.concat in pandas 1.0; reindexing on df_pcompany.index reproduces it.
df_pcompany_wide = pd.concat([df_pcompany, df_pcompany_temp], axis=1).reindex(df_pcompany.index)
print(df_pcompany_wide.shape)
df_pcompany_wide.head()
Out[44]:
In [59]:
# Melt to long format: each company column becomes (produced_by, is_from)
# rows, then keep only the pairs that actually apply and list them by title.
df_long_pcompany = pd.melt(frame=df_pcompany_wide,
                           id_vars=['title', 'production_companies'],
                           var_name='produced_by',
                           value_name='is_from')
df_long_pcompany.fillna(0, inplace=True)
df_sub_pcompany = df_long_pcompany[df_long_pcompany.is_from == 1]
# Bug fix: the original called df_sub_pcompany.set_index('title') and
# discarded the result (set_index is not in-place), so it was a no-op.
# It is removed rather than assigned back, because the selection below
# still needs 'title' as a column. A mid-cell .head() that displayed
# nothing (not the last expression) is also removed.
df_sub_pcompany[['title', 'produced_by']].sort_values(by=['title'], ascending=True)
Out[59]:
In [ ]: