In [1]:
import os
import pandas as pd
import numpy as np
from s3fs.core import S3FileSystem
from scripts import my_plotly as mp
In [2]:
# DELETE - FOR TROUBLESHOOTING ONLY
# import plotly
# import plotly.plotly as py
# import plotly.graph_objs as go
In [3]:
# Validate plotly credentials before any chart in this notebook is built.
# NOTE(review): `mp` is the project-local scripts.my_plotly module; what
# validate() actually checks (or raises on failure) is not visible from this
# notebook -- confirm in that module.
mp.validate()
In [4]:
# AWS keys are stored in an ini file that lives in the same path as this notebook.
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'
s3 = S3FileSystem(anon=False)
key = 'data.csv'
bucket = 'luther-02'
# Read the CSV inside a context manager so the S3 file handle is closed as
# soon as parsing finishes instead of staying open for the kernel's lifetime.
with s3.open('{}/{}'.format(bucket, key), mode='rb') as s3_file:
    df = pd.read_csv(s3_file)
# Convert release dates to datetime objects and derive year / month columns.
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.DatetimeIndex(df['Released']).year
df['Year_Int'] = pd.to_numeric(df['Year'])
df['Month'] = pd.DatetimeIndex(df['Released']).month
# Earliest and latest release year present in the data.
yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)
In [5]:
# Bar chart over the 'Year' column of the full data set; the year cutoff is
# only applied in a later cell, so nothing has been trimmed yet.
title = 'Count of Torrent Titles by Year Released'
_filename ='luther_films_annually'
x_title = 'Release Year'
y_title = 'Number of Titles'
# NOTE(review): bar_plot_data lives in scripts.my_plotly and its signature is
# not visible here. The positional arguments look like (frame, column, colour,
# title, x-axis title, y-axis title, boolean flag, output filename) -- confirm
# the order and the meaning of the boolean flag against that module.
mp.bar_plot_data(df, 'Year', 'blue', title, x_title, y_title, False, _filename)
Out[5]:
In [6]:
def df_year_limit(start, stop, df):
    """Return the rows of ``df`` whose 'Year' value lies within [start, stop].

    Parameters
    ----------
    start : number
        Lowest year to keep (inclusive).
    stop : number
        Highest year to keep (inclusive).
    df : pandas.DataFrame
        Frame containing a 'Year' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows. The input frame is left
        untouched. Rows whose 'Year' is missing fail both comparisons and
        are therefore dropped.
    """
    years = df['Year']
    in_range = (years >= start) & (years <= stop)
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a bare .loc slice raises SettingWithCopyWarning on pandas < 3.
    return df.loc[in_range].copy()
In [7]:
# Row count prior to applying the release-year window.
yr_before = len(df)

# Inclusive release-year window to keep.
yr_start = 1995
yr_stop = 2015

# Keep only titles released inside the window and count what remains.
df = df_year_limit(yr_start, yr_stop, df)
yr_after = len(df)

# Report how many rows (absolute and as a percentage) the window removed.
print(
    '{0} entries lost ({1}%) due to date slice between {2} and {3}'.format(
        yr_before - yr_after,
        round((yr_before - yr_after) / yr_before * 100, 2),
        yr_start,
        yr_stop,
    )
)
In [8]:
# Same bar chart over the 'Year' column as before, now on the frame trimmed
# to the yr_start..yr_stop window; the window is embedded in both the chart
# title and the output filename.
title='Count of Torrent Titles by Release Year ({0}-{1})'.format(yr_start, yr_stop)
_filename='luther_films_annually({0}-{1})'.format(yr_start, yr_stop)
x_title = 'Release Year'
y_title = 'Number of Titles'
# NOTE(review): the meaning of the positional boolean (False here) is defined
# in scripts.my_plotly.bar_plot_data and is not visible from this notebook.
mp.bar_plot_data(df, 'Year', 'blue', title, x_title, y_title, False, _filename)
Out[8]:
In [9]:
# Split a comma-separated genre string into a Series with one genre per element.
def split_to_array(_series):
    """Split one genre string into a pandas Series of individual genres.

    Despite its name, ``_series`` is a single cell value (this function is
    applied element-wise to the 'Genre' column), e.g.
    ``'Action, Adventure, Thriller'``.

    Parameters
    ----------
    _series : str or missing value
        Genre list for one title. Commas are removed and the remainder is
        split on single spaces.

    Returns
    -------
    pandas.Series
        One element per genre token. A non-string input (NaN / None, which is
        how ``read_csv`` represents blank or "N/A" cells) yields an empty
        Series instead of raising ``AttributeError``.
    """
    if not isinstance(_series, str):
        # Missing genre: contribute no tokens rather than crashing on .strip().
        return pd.Series([], dtype=object)
    tokens = _series.strip().replace(',', '').split(' ')
    return pd.Series(tokens)
In [10]:
# Expand every title's genre list, flatten all tokens into one Series, drop
# the padding NaNs and count the occurrences of each genre (largest first).
genres = (
    pd.Series(df['Genre'].apply(split_to_array).values.ravel())
    .dropna()
    .value_counts()
    .sort_values(ascending=False)
)

# Two-column frame (genre name, occurrence count) used for plotting.
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']
In [11]:
# Bar chart built from genre_ser, the per-genre occurrence table with
# 'Genre' and 'Count' columns, for the yr_start..yr_stop window.
title='Count of Genre Classifications ({0}-{1})'.format(yr_start, yr_stop)
_filename = 'luther_genre_quantity({0}-{1})'.format(yr_start, yr_stop)
x_title = 'Genre'
y_title = 'Number of Classifications'
# NOTE(review): the positional boolean is True here, whereas the per-year
# charts pass False -- presumably it tells bar_plot_data that the frame
# already holds aggregated counts; confirm in scripts.my_plotly.
mp.bar_plot_data(genre_ser, 'Genre', 'yellow', title, x_title, y_title, True, _filename)
Out[11]:
Given that each film is classified under multiple genres (e.g., 'Action, Adventure, Thriller'), the dominant genre of a film is the single genre from its own genre list that occurs most often across the whole data set. In the example 'Action, Adventure, Thriller', 'Action' is the genre from that list that occurs most often in the data set, and it is therefore used as that film's 'dominant genre'.
In [12]:
def convert_frequency(ser, genres=genres):
    """Return the dominant genre for one title's genre string.

    The dominant genre is the entry of the title's own genre list that has
    the highest count in ``genres``.

    Parameters
    ----------
    ser : str or missing value
        Comma-separated genre list for one title, e.g.
        ``'Action, Adventure, Thriller'``.
    genres : pandas.Series
        Occurrence counts indexed by genre name. Defaults to the notebook's
        global ``genres`` as it exists when this cell is executed.

    Returns
    -------
    The genre label with the highest count, or NaN when ``ser`` is not a
    string (missing genre).
    """
    if not isinstance(ser, str):
        # Missing genre: no dominant genre can be derived.
        return np.nan
    tokens = ser.strip().replace(',', '').split(' ')
    # idxmax() returns the index *label* (the genre name). argmax() returns
    # the integer position on pandas >= 1.0, which would fill the column
    # with 0/1/2... instead of genre names.
    return genres.loc[tokens].idxmax()
In [13]:
# Collapse each title's genre list into its single dominant genre and store
# it in a new 'Genre_Single' column.
df['Genre_Single'] = df['Genre'].apply(convert_frequency)

# Tally how many titles fall under each dominant genre, as a two-column
# frame ready for plotting.
df_count = (
    df['Genre_Single']
    .value_counts()
    .reset_index()
)
df_count.columns = ['Genre_Single', 'Count']
In [14]:
# Bar chart built from df_count, the per-dominant-genre tally with
# 'Genre_Single' and 'Count' columns, for the yr_start..yr_stop window.
title = 'Quantity of Dominant Genre Classifications ({0}-{1})'.format(yr_start, yr_stop)
_filename = 'luther_dominant_genres({0}-{1})'.format(yr_start, yr_stop)
x_title = 'Genre'
y_title = 'Number of Classifications'
# NOTE(review): the positional boolean (True) is interpreted by
# scripts.my_plotly.bar_plot_data; its meaning is not visible here -- confirm.
mp.bar_plot_data(df_count, 'Genre_Single', 'orange', title, x_title, y_title, True, _filename)
Out[14]:
In [15]:
# Title, axis labels and output filename for a stacked genre-by-year chart
# covering the yr_start..yr_stop window.
_title = 'Genres Annually ({0}-{1})'.format(yr_start, yr_stop)
_x_title = 'Year'
_y_title = 'Number of Films'
_filename = 'luther_stackedGenres_years({0}-{1})'.format(yr_start, yr_stop)
# The plotting call is currently disabled, so this cell only defines the
# variables above. Note that the disabled call hard-codes
# _filename='stackedBar' rather than using the _filename built above.
#mp.get_stackedBar(df, 'Genre_Single', 'Year', _title, _x_title, _y_title, _filename='stackedBar')
In [ ]: