Pandas practice on IMDb movie metadata downloaded from Kaggle

Source of the data set: https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset

Pandas Cheatsheets: https://assets.datacamp.com/blog_assets/PandasPythonForDataScience.pdf

https://s3.amazonaws.com/quandl-static-content/Documents/Quandl+-+Pandas,+SciPy,+NumPy+Cheat+Sheet.pdf

Getting started and checking the setup



In [49]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
# Show the installed pandas version
pd.__version__









    Out[2]:





'0.19.1'

Importing the dataset and understanding it



In [3]:

    
# Load the movie metadata spreadsheet into a DataFrame
imdb = pd.read_excel(io='movie_metadata.xls')



In [15]:

    
# Preview the first rows (head() returns five rows by default)
imdb.head()









    Out[15]:






  
    
      
      color
      director_name
      num_critic_for_reviews
      duration
      director_facebook_likes
      actor_3_facebook_likes
      actor_2_name
      actor_1_facebook_likes
      gross
      genres
      ...
      language
      country
      content_rating
      budget
      title_year
      actor_2_facebook_likes
      imdb_score
      aspect_ratio
      movie_facebook_likes
      movie_title_refined
    
  
  
    
      0
      Color
      James Cameron
      723.0
      178.0
      0.0
      855.0
      Joel David Moore
      1000.0
      760505847.0
      Action|Adventure|Fantasy|Sci-Fi
      ...
      English
      USA
      PG-13
      237000000.0
      2009.0
      936.0
      7.9
      1.78
      33000
      AvatarÂ
    
    
      1
      Color
      Gore Verbinski
      302.0
      169.0
      563.0
      1000.0
      Orlando Bloom
      40000.0
      309404152.0
      Action|Adventure|Fantasy
      ...
      English
      USA
      PG-13
      300000000.0
      2007.0
      5000.0
      7.1
      2.35
      0
      Pirates of the Caribbean: At World's EndÂ
    
    
      2
      Color
      Sam Mendes
      602.0
      148.0
      0.0
      161.0
      Rory Kinnear
      11000.0
      200074175.0
      Action|Adventure|Thriller
      ...
      English
      UK
      PG-13
      245000000.0
      2015.0
      393.0
      6.8
      2.35
      85000
      SpectreÂ
    
    
      3
      Color
      Christopher Nolan
      813.0
      164.0
      22000.0
      23000.0
      Christian Bale
      27000.0
      448130642.0
      Action|Thriller
      ...
      English
      USA
      PG-13
      250000000.0
      2012.0
      23000.0
      8.5
      2.35
      164000
      The Dark Knight RisesÂ
    
    
      4
      NaN
      Doug Walker
      NaN
      NaN
      131.0
      NaN
      Rob Walker
      131.0
      NaN
      Documentary
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      12.0
      7.1
      NaN
      0
      Star Wars: Episode VII - The Force AwakensÂ
    
  

5 rows × 29 columns



In [11]:

    
# Size of the data set as a (rows, columns) tuple
imdb.shape









    Out[11]:





(5043, 29)



In [5]:

    
# Column labels of the data set.
# NOTE(review): the output recorded for this cell lists 28 columns and omits
# 'movie_title_refined', while the head()/shape/dtypes outputs show 29 columns
# including it. No visible cell creates that column -- presumably it came from a
# cell that was later removed; confirm and restore it so the notebook survives
# Restart & Run All.
imdb.columns









    Out[5]:





Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')



In [16]:

    
# Data type of each column
imdb.dtypes









    Out[16]:





color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
movie_title_refined           object
dtype: object



In [17]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
imdb.describe()









    Out[17]:






  
    
      
      num_critic_for_reviews
      duration
      director_facebook_likes
      actor_3_facebook_likes
      actor_1_facebook_likes
      gross
      num_voted_users
      cast_total_facebook_likes
      facenumber_in_poster
      num_user_for_reviews
      budget
      title_year
      actor_2_facebook_likes
      imdb_score
      aspect_ratio
      movie_facebook_likes
    
  
  
    
      count
      4993.000000
      5028.000000
      4939.000000
      5020.000000
      5036.000000
      4.159000e+03
      5.043000e+03
      5043.000000
      5030.000000
      5022.000000
      4.551000e+03
      4935.000000
      5030.000000
      5043.000000
      4714.000000
      5043.000000
    
    
      mean
      140.194272
      107.201074
      686.509212
      645.009761
      6560.047061
      4.846841e+07
      8.366816e+04
      9699.063851
      1.371173
      272.770808
      3.975262e+07
      2002.470517
      1651.754473
      6.442138
      2.220403
      7525.964505
    
    
      std
      121.601675
      25.197441
      2813.328607
      1665.041728
      15020.759120
      6.845299e+07
      1.384853e+05
      18163.799124
      2.013576
      377.982886
      2.061149e+08
      12.474599
      4042.438863
      1.125116
      1.385113
      19320.445110
    
    
      min
      1.000000
      7.000000
      0.000000
      0.000000
      0.000000
      1.620000e+02
      5.000000e+00
      0.000000
      0.000000
      1.000000
      2.180000e+02
      1916.000000
      0.000000
      1.600000
      1.180000
      0.000000
    
    
      25%
      50.000000
      93.000000
      7.000000
      133.000000
      614.000000
      5.340988e+06
      8.593500e+03
      1411.000000
      0.000000
      65.000000
      6.000000e+06
      1999.000000
      281.000000
      5.800000
      1.850000
      0.000000
    
    
      50%
      110.000000
      103.000000
      49.000000
      371.500000
      988.000000
      2.551750e+07
      3.435900e+04
      3090.000000
      1.000000
      156.000000
      2.000000e+07
      2005.000000
      595.000000
      6.600000
      2.350000
      166.000000
    
    
      75%
      195.000000
      118.000000
      194.500000
      636.000000
      11000.000000
      6.230944e+07
      9.630900e+04
      13756.500000
      2.000000
      326.000000
      4.500000e+07
      2011.000000
      918.000000
      7.200000
      2.350000
      3000.000000
    
    
      max
      813.000000
      511.000000
      23000.000000
      23000.000000
      640000.000000
      7.605058e+08
      1.689764e+06
      656730.000000
      43.000000
      5060.000000
      1.221550e+10
      2016.000000
      137000.000000
      9.500000
      16.000000
      349000.000000



In [13]:

    
# Check whether the data set contains at least one missing value
pd.isnull(imdb).values.any()









    Out[13]:





True



In [18]:

    
# Count the missing values in each column.
# isnull() already covers every column, so re-selecting all columns first is unnecessary.
imdb.isnull().sum()









    Out[18]:





color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
movie_title_refined            0
dtype: int64

Exploring the data set

The following section contains a few simple questions and answers for exploring the IMDb 5000 data set

Finding out the number of unique languages the movies are in and plotting them



In [22]:

    
# Distinct values found in the language column (missing values appear as nan)
imdb.loc[:, 'language'].unique()









    Out[22]:





array(['English', nan, 'Japanese', 'French', 'Mandarin', 'Aboriginal',
       'Spanish', 'Filipino', 'Hindi', 'Russian', 'Maya', 'Kazakh',
       'Telugu', 'Cantonese', 'Icelandic', 'German', 'Aramaic', 'Italian',
       'Dutch', 'Dari', 'Hebrew', 'Chinese', 'Mongolian', 'Swedish',
       'Korean', 'Thai', 'Polish', 'Bosnian', 'None', 'Hungarian',
       'Portuguese', 'Danish', 'Arabic', 'Norwegian', 'Czech', 'Kannada',
       'Zulu', 'Panjabi', 'Tamil', 'Dzongkha', 'Vietnamese', 'Indonesian',
       'Urdu', 'Romanian', 'Persian', 'Slovenian', 'Greek', 'Swahili'], dtype=object)



In [23]:

    
# Number of movies whose language is missing
pd.isnull(imdb['language']).sum()









    Out[23]:





12



In [26]:

    
# Number of movies per language (rows with a missing language are not counted)
imdb['language'].groupby(imdb['language']).count()









    Out[26]:





language
Aboriginal       2
Arabic           5
Aramaic          1
Bosnian          1
Cantonese       11
Chinese          3
Czech            1
Danish           5
Dari             2
Dutch            4
Dzongkha         1
English       4704
Filipino         1
French          73
German          19
Greek            1
Hebrew           5
Hindi           28
Hungarian        1
Icelandic        2
Indonesian       2
Italian         11
Japanese        18
Kannada          1
Kazakh           1
Korean           8
Mandarin        26
Maya             1
Mongolian        1
None             2
Norwegian        4
Panjabi          1
Persian          4
Polish           4
Portuguese       8
Romanian         2
Russian         11
Slovenian        1
Spanish         40
Swahili          1
Swedish          5
Tamil            1
Telugu           1
Thai             3
Urdu             1
Vietnamese       1
Zulu             2
Name: language, dtype: int64



In [32]:

    
# Keep every row whose language is not 'English'.
# Rows with a missing language also pass this filter, because NaN compares as not-equal.
imdb_non_english = imdb[imdb['language'].ne('English')]



In [33]:

    
# Movie count for each language in the non-English subset
non_english_movies = imdb_non_english['language'].groupby(imdb_non_english['language']).count()



In [34]:

    
# Display the per-language movie counts held in non_english_movies
non_english_movies









    Out[34]:





language
Aboriginal     2
Arabic         5
Aramaic        1
Bosnian        1
Cantonese     11
Chinese        3
Czech          1
Danish         5
Dari           2
Dutch          4
Dzongkha       1
Filipino       1
French        73
German        19
Greek          1
Hebrew         5
Hindi         28
Hungarian      1
Icelandic      2
Indonesian     2
Italian       11
Japanese      18
Kannada        1
Kazakh         1
Korean         8
Mandarin      26
Maya           1
Mongolian      1
None           2
Norwegian      4
Panjabi        1
Persian        4
Polish         4
Portuguese     8
Romanian       2
Russian       11
Slovenian      1
Spanish       40
Swahili        1
Swedish        5
Tamil          1
Telugu         1
Thai           3
Urdu           1
Vietnamese     1
Zulu           2
Name: language, dtype: int64



In [59]:

    
# Bar chart of the movie count for each language in non_english_movies
chart = non_english_movies.plot(kind='bar', title = "Count of non english movies")
# Label both axes explicitly so the figure can be read on its own
chart.set(xlabel='Language', ylabel='Number of movies')
chart.autoscale(enable=True, axis='y', tight=False)

Find out the number of movies with a rating greater than 7.5



In [60]:

    
# Number of movies whose IMDb score is strictly above 7.5
len(imdb[imdb['imdb_score'] > 7.5])









    Out[60]:





747



In [62]:

    
# Count the movie titles at each distinct IMDb score, then draw the counts as a line plot
imdb_groupby_ratings = imdb['movie_title'].groupby(imdb['imdb_score']).count()
imdb_groupby_ratings.plot()









    Out[62]:





<matplotlib.axes._subplots.AxesSubplot at 0x114b664a8>

Find out the number of movies with a runtime greater than 3 hours



In [63]:

    
# Number of movies whose duration value is strictly above 180 (the 3-hour cutoff from the heading)
len(imdb[imdb['duration'] > 180])









    Out[63]:





66



In [64]:

    
# Count the movie titles at each distinct duration, then draw the counts as a line plot
imdb_groupby_duration = imdb['movie_title'].groupby(imdb['duration']).count()
imdb_groupby_duration.plot()









    Out[64]:





<matplotlib.axes._subplots.AxesSubplot at 0x115b8c6d8>



In [ ]:

	color	director_name	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_2_name	actor_1_facebook_likes	gross	genres	...	language	country	content_rating	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes	movie_title_refined
0	Color	James Cameron	723.0	178.0	0.0	855.0	Joel David Moore	1000.0	760505847.0	Action\|Adventure\|Fantasy\|Sci-Fi	...	English	USA	PG-13	237000000.0	2009.0	936.0	7.9	1.78	33000	AvatarÂ
1	Color	Gore Verbinski	302.0	169.0	563.0	1000.0	Orlando Bloom	40000.0	309404152.0	Action\|Adventure\|Fantasy	...	English	USA	PG-13	300000000.0	2007.0	5000.0	7.1	2.35	0	Pirates of the Caribbean: At World's EndÂ
2	Color	Sam Mendes	602.0	148.0	0.0	161.0	Rory Kinnear	11000.0	200074175.0	Action\|Adventure\|Thriller	...	English	UK	PG-13	245000000.0	2015.0	393.0	6.8	2.35	85000	SpectreÂ
3	Color	Christopher Nolan	813.0	164.0	22000.0	23000.0	Christian Bale	27000.0	448130642.0	Action\|Thriller	...	English	USA	PG-13	250000000.0	2012.0	23000.0	8.5	2.35	164000	The Dark Knight RisesÂ
4	NaN	Doug Walker	NaN	NaN	131.0	NaN	Rob Walker	131.0	NaN	Documentary	...	NaN	NaN	NaN	NaN	NaN	12.0	7.1	NaN	0	Star Wars: Episode VII - The Force AwakensÂ

	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_1_facebook_likes	gross	num_voted_users	cast_total_facebook_likes	facenumber_in_poster	num_user_for_reviews	budget	title_year	actor_2_facebook_likes	imdb_score	aspect_ratio	movie_facebook_likes
count	4993.000000	5028.000000	4939.000000	5020.000000	5036.000000	4.159000e+03	5.043000e+03	5043.000000	5030.000000	5022.000000	4.551000e+03	4935.000000	5030.000000	5043.000000	4714.000000	5043.000000
mean	140.194272	107.201074	686.509212	645.009761	6560.047061	4.846841e+07	8.366816e+04	9699.063851	1.371173	272.770808	3.975262e+07	2002.470517	1651.754473	6.442138	2.220403	7525.964505
std	121.601675	25.197441	2813.328607	1665.041728	15020.759120	6.845299e+07	1.384853e+05	18163.799124	2.013576	377.982886	2.061149e+08	12.474599	4042.438863	1.125116	1.385113	19320.445110
min	1.000000	7.000000	0.000000	0.000000	0.000000	1.620000e+02	5.000000e+00	0.000000	0.000000	1.000000	2.180000e+02	1916.000000	0.000000	1.600000	1.180000	0.000000
25%	50.000000	93.000000	7.000000	133.000000	614.000000	5.340988e+06	8.593500e+03	1411.000000	0.000000	65.000000	6.000000e+06	1999.000000	281.000000	5.800000	1.850000	0.000000
50%	110.000000	103.000000	49.000000	371.500000	988.000000	2.551750e+07	3.435900e+04	3090.000000	1.000000	156.000000	2.000000e+07	2005.000000	595.000000	6.600000	2.350000	166.000000
75%	195.000000	118.000000	194.500000	636.000000	11000.000000	6.230944e+07	9.630900e+04	13756.500000	2.000000	326.000000	4.500000e+07	2011.000000	918.000000	7.200000	2.350000	3000.000000
max	813.000000	511.000000	23000.000000	23000.000000	640000.000000	7.605058e+08	1.689764e+06	656730.000000	43.000000	5060.000000	1.221550e+10	2016.000000	137000.000000	9.500000	16.000000	349000.000000