In [138]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot
import math
In [139]:
# Load every CSV export into a DataFrame, keyed by its file name so later
# cells can look a table up as dataframes['titles.csv'] and so on.
title_file = "titles.csv"
release_date_file = "release_dates.csv"
cast_file = "cast.csv"
files = [title_file, release_date_file, cast_file]
dataframes = {}
for f in files:
    # Open through a context manager so the handle is closed as soon as
    # pandas has finished parsing; a bare open(f) is never closed.
    with open(f) as csv_handle:
        csv_file_object = pd.read_csv(csv_handle, header=0)
    dataframes[f] = csv_file_object
In [140]:
# Per-column count of non-null values. Only the last expression of a cell is
# rendered, so this result is computed but not shown.
dataframes['titles.csv'].count()
# Number of distinct values in the first column of the titles table.
# .iloc is the positional indexer; the old .ix indexer was deprecated and
# later removed from pandas.
len(dataframes['titles.csv'].iloc[:, 0].unique())
Out[140]:
In [141]:
# The two rows with the smallest release year: order by year, keep the top two.
dataframes['titles.csv'].sort_values(by='year').head(2)
Out[141]:
In [142]:
# How many rows are titled exactly 'Hamlet': each True in the mask counts as 1.
int(dataframes['titles.csv']['title'].eq('Hamlet').sum())
Out[142]:
In [143]:
# How many rows are titled exactly 'North by Northwest': each True in the
# mask counts as 1.
int(dataframes['titles.csv']['title'].eq('North by Northwest').sum())
Out[143]:
In [144]:
# Year of the earliest 'Hamlet': take the year column for matching rows,
# sort ascending, and read off the first entry.
dataframes['titles.csv']['year'][
    dataframes['titles.csv']['title'].eq('Hamlet')
].sort_values().iloc[0]
Out[144]:
In [145]:
# Every release year of a title called 'Treasure Island', oldest first.
dataframes['titles.csv']['year'][
    dataframes['titles.csv']['title'].eq('Treasure Island')
].sort_values()
Out[145]:
In [147]:
# Number of rows whose year is 1950.
dataframes['titles.csv']['year'][
    dataframes['titles.csv']['year'].eq(1950)
].count()
Out[147]:
In [149]:
# Number of rows whose year falls in 1950-1959; between() includes both
# endpoints by default.
dataframes['titles.csv']['year'][
    dataframes['titles.csv']['year'].between(1950, 1959)
].count()
Out[149]:
In [126]:
# Number of distinct release years among rows titled 'Batman'.
# dropna=False keeps a missing year as its own value, as unique() does.
dataframes['titles.csv']['year'][
    dataframes['titles.csv']['title'].eq('Batman')
].nunique(dropna=False)
Out[126]:
In [150]:
# Number of non-null 'character' entries among cast rows for 'Inception'.
dataframes['cast.csv']['character'][
    dataframes['cast.csv']['title'].eq('Inception')
].count()
Out[150]:
In [137]:
# Number of titles per decade.
# Floor-dividing the year by 10 and multiplying back maps each year to the
# first year of its decade (1957 -> 1950). The arithmetic is vectorised over
# the whole column; math.floor only accepts scalars, so it cannot be applied
# to a Series. The key is computed on the fly, so the titles table itself is
# not modified.
dataframes['titles.csv'].groupby(
    (dataframes['titles.csv']['year'] // 10 * 10).rename('decade')
).size()
Out[137]:
In [ ]: