In [2]:
    
## Gnani Beldhari
## Goal One
## Ingest a CSV file as pure text, then as a list of lines, and then as a data frame.
    
In [1]:
    
import csv
import numpy as np
import pandas as pd
import json
    
In [2]:
    
# Text - Read the CSV file, iterating through the lines and appending each one to a string
movielistString = ''
with open('tmdb_5000_movies.csv', mode='r', encoding='utf8') as f:
    for line in f:
        movielistString += line
print(movielistString[:1000])
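    
In [ ]:
    
# A minimal alternative sketch, not part of the original step-by-step approach: the whole
# file can also be read in a single call with f.read(), avoiding the explicit loop above.
# movielistString_alt is a hypothetical name introduced just for this illustration.
with open('tmdb_5000_movies.csv', mode='r', encoding='utf8') as f:
    movielistString_alt = f.read()
print(movielistString_alt[:1000])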
    
    
In [9]:
    
# List - Read the CSV file, iterating through each line and appending it (minus the newline) to a list
movieList = []
with open('tmdb_5000_movies.csv', mode='r', encoding='utf8') as f:
    for line in f:
        movieList.append(line.replace('\n', ''))
movieList[1]
    
    Out[9]:
In [38]:
    
# Data frame - Use pandas' read_csv function to load the CSV file directly into a DataFrame.
df_movielist = pd.read_csv('tmdb_5000_movies.csv')
df_movielist.head(2)
    
    Out[38]:
In [13]:
    
## Goal Two
## Right now, the file is in a 'narrow' format. In other words, several interesting bits are collapsed into a single field. Let's attempt to make the data frame a 'wide' format, with all the collapsed items expanded horizontally (a compact toy-data sketch follows in the next cell).
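    
In [ ]:
    
# A minimal sketch of the narrow-to-wide idea on toy data, assuming the collapsed field has
# already been parsed into Python lists of names (the step-by-step version on the real data
# follows below). toy_df, the titles, and the genre names are hypothetical illustration values.
toy_df = pd.DataFrame({'title': ['Movie A', 'Movie B'],
                       'genres': [['Action', 'Comedy'], ['Drama']]})
# Join each list into a '|'-separated string, then str.get_dummies builds one 0/1 column per value
toy_wide = toy_df.join(toy_df['genres'].apply('|'.join).str.get_dummies())
print(toy_wide)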
    
In [30]:
    
# Select the subset of columns [title, genres] to work with the genres data
df_genres = df_movielist[['title','genres']].copy()                       
print(df_genres.shape)
df_genres.head()
    
    
    Out[30]:
In [31]:
    
# Parse the JSON-formatted genres field and keep just the list of genre names in the 'genres' column
df_genres['genres'] = df_genres['genres'].apply(pd.read_json)
df_genres['genres'] = df_genres['genres'].apply(lambda x: '' if x.empty else x['name'].values)
df_genres.head()
    
    Out[31]:
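In [ ]:
    
# Note: passing a literal JSON string to pd.read_json is deprecated in newer pandas releases.
# A minimal alternative sketch using the json module imported above; parse_names is a
# hypothetical helper, not part of the original notebook, and is shown only as an option.
def parse_names(json_text):
    # json.loads returns a list of dicts like {'id': ..., 'name': ...}; keep only the names
    parsed = json.loads(json_text)
    return '' if not parsed else [item['name'] for item in parsed]

# Example usage (commented out so it does not alter the dataframe built above):
# df_genres['genres'] = df_movielist['genres'].apply(parse_names)
    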
In [32]:
    
# Define a function that returns the unique values found across a pandas DataFrame column (Series) of lists
def get_unique(df, col_name):
    col_list = []
    for values in df[col_name]:
        for val in values:
            col_list.append(val)
    return np.unique(col_list)
    
In [33]:
    
# Get the list of unique genre values; these will become the new columns
unique_cols = get_unique(df_genres,'genres')
unique_cols
    
    Out[33]:
In [34]:
    
# Create a new dataframe with the unique genres as columns and mark each movie's genres with 1
df_genres_temp = pd.DataFrame(columns=unique_cols)
for key, value in df_genres['genres'].items():
    for genre in value:
        df_genres_temp.loc[key, genre] = 1
#df_genres_temp.fillna(0,inplace=True)
print(df_genres_temp.shape)
df_genres_temp.head()
    
    
    Out[34]:
In [36]:
    
# Add the new dataframe's columns to the original dataframe
df_genres_wide = pd.concat([df_genres, df_genres_temp], axis=1).reindex(df_genres.index)
print(df_genres_wide.shape)
df_genres_wide.head()
    
    
    Out[36]:
In [ ]:
    
# Wide data sets are good for exploration, but 'long' data sets are good for training.
# Let's attempt to expand all the collapsed fields vertically instead of horizontally.
# Does this result in data duplication? What do you think about that? Yes and No are both correct -- but what's the context?
# A long data set may duplicate and repeat values, but it provides a simple, standardized
# way of structuring the data that makes it easier for an analyst (or a computer) to extract the needed variables.
# Hadley Wickham described tidy data as having the following characteristics:
# 1. Each variable forms a column.
# 2. Each observation forms a row.
# 3. Each type of observational unit forms a table.
# The TMDB data set is untidy, with multiple values and observations packed into single columns (genres, production companies, etc.).
# We can treat each of these columns as an observational unit and separate it into its own table for analysis.
# (A tiny toy example of the long format and its duplication is sketched in the next cell.)
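    
In [ ]:
    
# A tiny sketch of the duplication trade-off on the toy data from the Goal Two sketch above:
# melting the hypothetical toy_wide frame repeats each title once per genre column, but every
# resulting row is a single observation (title, genre_category, is_type).
toy_long = pd.melt(toy_wide.drop(columns=['genres']),
                   id_vars=['title'],
                   var_name='genre_category',
                   value_name='is_type')
print(toy_long)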
    
In [37]:
    
# Use pandas' melt function to reshape the wide genre columns into rows (long format)
df_long_genres = pd.melt(frame=df_genres_wide,
                       id_vars = ['title','genres'],
                       var_name="genre_category",
                       value_name="is_type")
df_long_genres.fillna(0,inplace=True)
df_long_genres = df_long_genres.sort_values(by=['title'],ascending=False)
df_long_genres.head(30)
    
    Out[37]:
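In [ ]:
    
# A small follow-up sketch, mirroring the filtering done for production companies further below:
# keep only the rows where the genre actually applies (is_type == 1). df_sub_genres is a
# hypothetical name introduced here for illustration.
df_sub_genres = df_long_genres[df_long_genres.is_type == 1]
df_sub_genres[['title', 'genre_category']].sort_values(by=['title'], ascending=True).head()
    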
In [39]:
    
# Select the subset of columns [title, production_companies] to work with the production companies data
df_pcompany = df_movielist[['title','production_companies']].copy()
print(df_pcompany.shape)
# Parse the JSON-formatted field and keep just the list of company names in the 'production_companies' column
df_pcompany['production_companies'] = df_pcompany['production_companies'].apply(pd.read_json)
df_pcompany['production_companies'] = df_pcompany['production_companies'].apply(lambda x: '' if x.empty else x['name'].values)
df_pcompany.head()
    
    
    Out[39]:
In [44]:
    
# Get the list of unique production company values; these will become the new columns
unique_cols = get_unique(df_pcompany,'production_companies')
unique_cols
# Create a new dataframe with the unique companies as columns and mark each movie's companies with 1
df_pcompany_temp = pd.DataFrame(columns=unique_cols)
for key, value in df_pcompany['production_companies'].items():
    for company in value:
        df_pcompany_temp.loc[key, company] = 1
#df_pcompany_temp.fillna(0,inplace=True)
print(df_pcompany_temp.shape)
# Add the new dataframe's columns to the original dataframe
df_pcompany_wide = pd.concat([df_pcompany, df_pcompany_temp], axis=1).reindex(df_pcompany.index)
print(df_pcompany_wide.shape)
df_pcompany_wide.head()
    
    
    Out[44]:
In [59]:
    
# Use pandas' melt function to reshape the wide production-company columns into rows (long format)
df_long_pcompany = pd.melt(frame=df_pcompany_wide,
                       id_vars = ['title','production_companies'],
                       var_name="produced_by",
                       value_name="is_from")
df_long_pcompany.fillna(0,inplace=True)
df_long_pcompany.head()
# Keep only the rows where the movie was actually produced by that company
df_sub_pcompany = df_long_pcompany[df_long_pcompany.is_from==1]
df_sub_pcompany[['title','produced_by']].sort_values(by=['title'],ascending=True)
    
    Out[59]:
In [ ]: