In [1]:
import os
import pandas as pd
import numpy as np
from s3fs.core import S3FileSystem

from scripts import my_plotly as mp

In [2]:
# DELETE - FOR TROUBLESHOOTING ONLY
# import plotly
# import plotly.plotly as py
# import plotly.graph_objs as go

In [3]:
# Validate plotly credentials
# NOTE(review): `validate` comes from the project-local `scripts.my_plotly`
# module (imported as `mp`); presumably it has to succeed before the plotting
# cells further down can run - confirm against that module.
mp.validate()

In [4]:
# AWS keys are read from an ini file stored in the same path as this notebook,
# so no credentials are written into the notebook itself.
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

s3 = S3FileSystem(anon=False)
key = 'data.csv'
bucket = 'luther-02'

# Read through a context manager so the S3 file handle is closed as soon as
# the CSV has been parsed instead of being left open for the kernel's lifetime.
with s3.open('{}/{}'.format(bucket, key), mode='rb') as csv_file:
    df = pd.read_csv(csv_file)

# update dates to datetime objects, then derive the calendar fields used below
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.DatetimeIndex(df['Released']).year
df['Year_Int'] = pd.to_numeric(df['Year'])
df['Month'] = pd.DatetimeIndex(df['Released']).month

# year extremities of the raw data (a later cell replaces these with the
# analysis window)
yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)

In [5]:
# Chart text and output name for the per-year title count (data not yet trimmed)
title, x_title, y_title = (
    'Count of Torrent Titles by Year Released',
    'Release Year',
    'Number of Titles',
)
_filename = 'luther_films_annually'

# Bar chart of the 'Year' column, drawn by the project's plotly helper
mp.bar_plot_data(
    df, 'Year', 'blue',
    title, x_title, y_title,
    False, _filename,
)


Out[5]:

Trim Dataset by Years of Interest/Relevance

Because very few titles were released before 1995, those torrents were removed from the dataset. Films released in 2016 were removed as well, since the current year (2016) is only partially complete.


In [6]:
def df_year_limit(start, stop, df):
    """Return the rows of ``df`` whose 'Year' lies in ``[start, stop]``.

    Parameters
    ----------
    start, stop : int
        First and last year to keep; both bounds are inclusive.
    df : pandas.DataFrame
        Frame with a 'Year' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with their original index
        labels preserved.
    """
    mask = (df['Year'] >= start) & (df['Year'] <= stop)
    # Return an explicit copy: callers add columns to the result, and doing
    # that on a bare slice of the input triggers SettingWithCopyWarning.
    return df.loc[mask].copy()

In [7]:
# Row count prior to applying the year window
yr_before = len(df)

# First and last release year to keep (both inclusive)
yr_start = 1995
yr_stop = 2015

# Apply the window and measure what is left
df = df_year_limit(yr_start, yr_stop, df)
yr_after = len(df)

# Report how many rows the window removed, as a count and as a percentage
yr_lost = yr_before - yr_after
yr_lost_pct = round(yr_lost / yr_before * 100, 2)
print('{0} entries lost ({1}%) due to date slice between {2} and {3}'.format(
    yr_lost, yr_lost_pct, yr_start, yr_stop))


267 entries lost (16.05%) due to date slice between 1995 and 2015

In [8]:
# Same annual chart as before, labelled with the year window now applied
year_window = (yr_start, yr_stop)
title = 'Count of Torrent Titles by Release Year ({0}-{1})'.format(*year_window)
_filename = 'luther_films_annually({0}-{1})'.format(*year_window)
x_title, y_title = 'Release Year', 'Number of Titles'

mp.bar_plot_data(
    df, 'Year', 'blue',
    title, x_title, y_title,
    False, _filename,
)


Out[8]:

Genre Classifications


In [9]:
# break one film's genre string into its individual genre names
def split_to_array(_series):
    """Split a genre string such as 'Action, Adventure, Thriller' into a Series.

    Despite its name, ``_series`` is a single string (one cell of the 'Genre'
    column). Surrounding whitespace is stripped, commas are removed, and the
    remainder is split on single spaces.

    Returns a ``pandas.Series`` with one genre name per element.
    """
    without_commas = _series.strip().replace(',', '')
    tokens = without_commas.split(' ')
    return pd.Series(np.array(tokens))

In [10]:
# Expand every film's genre string into a row of individual genre names
genre_table = df['Genre'].apply(split_to_array)

# Flatten the table into one long Series of names and drop the missing cells
genre_tokens = pd.Series(genre_table.values.ravel()).dropna()

# Count the occurrences of each genre, most frequent first
genres = genre_tokens.value_counts().sort_values(ascending=False)

# Convert the counts Series to a two-column dataframe for plotting
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']

In [11]:
# Labels and output name for the per-genre classification count
year_window = (yr_start, yr_stop)
title = 'Count of Genre Classifications ({0}-{1})'.format(*year_window)
_filename = 'luther_genre_quantity({0}-{1})'.format(*year_window)
x_title, y_title = 'Genre', 'Number of Classifications'

mp.bar_plot_data(
    genre_ser, 'Genre', 'yellow',
    title, x_title, y_title,
    True, _filename,
)


Out[11]:

Most Dominant Genre out of Genres Given per Film

Each film is classified under multiple genres (e.g. 'Action, Adventure, Thriller'). A film's dominant genre is the single genre from its own genre list that occurs most often across the total dataset. In the example 'Action, Adventure, Thriller', 'Action' occurs the most of the three and is therefore used as that film's 'dominant genre'.


In [12]:
def convert_frequency(ser, genres=genres):
    """Pick the dominant genre out of one film's genre string.

    The dominant genre is the entry of the film's own genre list that has the
    highest count in ``genres``.

    Parameters
    ----------
    ser : str
        Genre string for a single film, e.g. 'Action, Adventure, Thriller'.
    genres : pandas.Series
        Counts indexed by genre name. The default is bound when this function
        is defined, so re-run this cell after rebuilding ``genres``.

    Returns
    -------
    The index label of ``genres`` (the genre name) with the largest count
    among the film's genres.
    """
    split_array = np.array(ser.strip().replace(',','').split(' '))
    # idxmax returns the index *label*. Series.argmax returns the integer
    # position on current pandas, which would store numbers instead of names.
    genre = genres.loc[split_array].idxmax()
    return genre

In [13]:
# Reduce each film's genre list to its single most significant genre and
# store the result as a new column
dominant_genre = df['Genre'].apply(convert_frequency)
df['Genre_Single'] = dominant_genre

# Tally how many films ended up under each dominant genre
dominant_counts = df['Genre_Single'].value_counts()
df_count = dominant_counts.reset_index()
df_count.columns = ['Genre_Single', 'Count']

In [14]:
# Labels and output name for the dominant-genre count
year_window = (yr_start, yr_stop)
title = 'Quantity of Dominant Genre Classifications ({0}-{1})'.format(*year_window)
_filename = 'luther_dominant_genres({0}-{1})'.format(*year_window)
x_title, y_title = 'Genre', 'Number of Classifications'

mp.bar_plot_data(
    df_count, 'Genre_Single', 'orange',
    title, x_title, y_title,
    True, _filename,
)


Out[14]:

In [15]:
# Labels and output name for the stacked genres-per-year chart
year_window = (yr_start, yr_stop)
_title = 'Genres Annually ({0}-{1})'.format(*year_window)
_filename = 'luther_stackedGenres_years({0}-{1})'.format(*year_window)
_x_title, _y_title = 'Year', 'Number of Films'

# The stacked-bar call is disabled; note that it passes its own _filename
#mp.get_stackedBar(df, 'Genre_Single', 'Year', _title, _x_title, _y_title, _filename='stackedBar')

In [ ]: