In [49]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Display the version string of the imported pandas package.
pd.__version__
Out[2]:
In [3]:
# Load the movie_metadata Excel file into the `imdb` DataFrame.
imdb = pd.read_excel(io='movie_metadata.xls')
In [15]:
# Preview the first five rows of the frame.
imdb.head(n=5)
Out[15]:
In [11]:
# Dimensions of the frame as a (rows, columns) tuple.
imdb.shape
Out[11]:
In [5]:
# List the label of every column in the frame.
imdb.columns
Out[5]:
In [16]:
# Show the data type of each column.
imdb.dtypes
Out[16]:
In [17]:
# Summary statistics for the data set, as produced by DataFrame.describe().
imdb.describe()
Out[17]:
In [13]:
# Check whether at least one cell anywhere in the frame is missing:
# reduce each column to a flag first, then reduce those flags to one value.
imdb.isnull().any().any()
Out[13]:
In [18]:
# Count the missing values in each column.
# isnull() already covers every column of the frame, so re-selecting all
# columns first (imdb[imdb.columns[:]]) only made a needless copy.
imdb.isnull().sum()
Out[18]:
Find the unique languages the movies are in and plot the movie count for each non-English language
In [22]:
# Distinct values that occur in the `language` column.
pd.unique(imdb['language'])
Out[22]:
In [23]:
# How many rows have no value in the `language` column.
pd.isnull(imdb['language']).sum()
Out[23]:
In [26]:
# Number of `language` entries in each language group.
(
    imdb
    .groupby('language')['language']
    .count()
)
Out[26]:
In [32]:
# Keep only the movies whose language is known and is not English.
# A bare `!= 'English'` comparison is also True for missing (NaN) values,
# which would silently classify movies of unknown language as non-English.
imdb_non_english = imdb.loc[
    imdb['language'].notnull() & (imdb['language'] != 'English')
]
In [33]:
# Tally the non-English movies by language.
non_english_movies = (
    imdb_non_english
    .groupby('language')['language']
    .count()
)
In [34]:
# Display the per-language counts computed from imdb_non_english.
non_english_movies
Out[34]:
In [59]:
# Bar chart of the non-English movie counts, one bar per language.
chart = non_english_movies.plot(kind='bar', title="Count of non english movies")
# Label the y-axis so the figure can be read on its own.
chart.set_ylabel("Number of movies")
# Turn automatic scaling on for the y-axis (tight=False); kept as the last
# statement so the cell still produces no text output.
chart.autoscale(enable=True, axis='y', tight=False)
Find the number of movies with a rating greater than 7.5
In [60]:
# Count the rows whose imdb_score is strictly above 7.5.
len(imdb.loc[imdb['imdb_score'] > 7.5])
Out[60]:
In [62]:
# Number of movie_title entries at each distinct imdb_score value.
imdb_groupby_ratings = imdb.groupby(['imdb_score'])['movie_title'].count()
# Plot the counts against the score; a title and y-axis label are added so
# the figure is understandable without reading the code.
ratings_ax = imdb_groupby_ratings.plot(title="Number of movies per IMDb score")
ratings_ax.set_ylabel("Number of movies")
# Leave the Axes as the cell's last expression, as the original plot call did.
ratings_ax
Out[62]:
Find the number of movies with a runtime greater than 3 hours
In [63]:
# Count the rows whose duration is strictly above 180
# (presumably minutes, i.e. three hours — confirm against the data set).
len(imdb.loc[imdb['duration'] > 180])
Out[63]:
In [64]:
# Number of movie_title entries at each distinct duration value.
imdb_groupby_duration = imdb.groupby(['duration'])['movie_title'].count()
# Plot the counts against the duration; a title and y-axis label are added so
# the figure is understandable without reading the code.
duration_ax = imdb_groupby_duration.plot(title="Number of movies per duration")
duration_ax.set_ylabel("Number of movies")
# Leave the Axes as the cell's last expression, as the original plot call did.
duration_ax
Out[64]:
In [ ]: