In [138]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot
import math

In [139]:
# Input CSV files, resolved relative to the notebook's working directory.
title_file = "titles.csv"
release_date_file = "release_dates.csv"
cast_file = "cast.csv"

files = [title_file, release_date_file, cast_file]

# Maps each file name to the DataFrame parsed from it; later cells look frames
# up by file name, e.g. dataframes['titles.csv'].
dataframes = {}

for f in files:
    # Open through a context manager so the handle is closed as soon as pandas
    # has finished parsing, instead of leaking one open file per CSV.
    with open(f) as csv_handle:
        dataframes[f] = pd.read_csv(csv_handle, header=0)

Question 1: How many movies are listed in the titles dataframe?

210591

Bonus

Unique movie titles: 192839


In [140]:
# Bonus answer: number of distinct values in the first column of the titles frame.
# `.iloc` is used for the positional column lookup because `.ix` was removed in
# pandas 1.0 and raises AttributeError on current versions.
# Only a cell's last expression is displayed, so no separate `.count()` call is
# made here -- its result would be discarded.
len(dataframes['titles.csv'].iloc[:, 0].unique())


Out[140]:
192839

Question 2: What are the earliest two films listed in the titles dataframe?

Miss Jerry (1894) and Reproduction of the Corbett and Fitzsimmons Fight (1897)


In [141]:
# Order the titles by release year (ascending) and keep the first two rows.
dataframes['titles.csv'].sort_values(by='year').head(2)


Out[141]:
title year
161037 Miss Jerry 1894
83650 Reproduction of the Corbett and Fitzsimmons Fight 1897

Question 3: How many movies have the title "Hamlet"?

19


In [142]:
# Rows whose title is exactly "Hamlet": summing the boolean mask counts its True entries.
int((dataframes['titles.csv']['title'] == 'Hamlet').sum())


Out[142]:
19

Question 4: How many movies are titled "North by Northwest"?

1


In [143]:
# Rows whose title is exactly "North by Northwest": summing the boolean mask
# counts its True entries.
int((dataframes['titles.csv']['title'] == 'North by Northwest').sum())


Out[143]:
1

Question 5: When was the first movie titled "Hamlet" made?

1910


In [144]:
titles = dataframes['titles.csv']
is_hamlet = titles['title'] == 'Hamlet'
# Years of every "Hamlet" row in ascending order; the first entry is the earliest.
hamlet_years = titles.loc[is_hamlet, 'year'].sort_values()
hamlet_years.iloc[0]


Out[144]:
1910

Question 6: List all of the "Treasure Island" movies from earliest to most recent

  • 1918
  • 1920
  • 1934
  • 1950
  • 1972
  • 1973
  • 1985
  • 1999

In [145]:
titles = dataframes['titles.csv']
treasure_island = titles[titles['title'] == 'Treasure Island']
# Release years, earliest first; each value keeps its original row label.
treasure_island['year'].sort_values()


Out[145]:
186595    1918
46520     1920
188082    1934
88003     1950
207718    1972
101151    1973
186007    1985
162493    1999
Name: year, dtype: int64

Question 7. How many movies were made in the year 1950?

1033


In [147]:
years = dataframes['titles.csv']['year']
# Number of non-null entries in the year column that equal 1950.
years[years == 1950].count()


Out[147]:
1033

Question 8. How many movies were made from 1950 through 1959?

11999


In [149]:
years = dataframes['titles.csv']['year']
# between() is inclusive on both ends by default, i.e. 1950 <= year <= 1959.
years[years.between(1950, 1959)].count()


Out[149]:
11999

Question 9. In what years has a movie titled "Batman" been released?

2 distinct years (the cell below counts the release years rather than listing them)


In [126]:
titles = dataframes['titles.csv']
batman_years = titles[titles['title'] == 'Batman']['year']
# NOTE(review): the question asks *which* years; this expression reports only
# how many distinct years there are. Displaying batman_years.unique() would
# list the years themselves.
len(batman_years.unique())


Out[126]:
2

Question 10. How many roles were there in the movie "Inception"?

72


In [150]:
cast = dataframes['cast.csv']
inception_characters = cast[cast['title'] == 'Inception']['character']
# count() tallies only the non-null character entries.
inception_characters.count()


Out[150]:
72

WE GIVE UP


In [137]:
# Group the titles by release year. Nothing is aggregated, so the cell simply
# displays the GroupBy object.
titles = dataframes['titles.csv']
titles.groupby(by='year')
# TODO: an earlier attempt to derive a 'decade' column passed the whole 'year'
# Series to math.floor(), which only accepts a scalar. A vectorized form such
# as (titles['year'] // 10) * 10 would avoid that.


Out[137]:
<pandas.core.groupby.DataFrameGroupBy object at 0x263bf0b90>

In [ ]: