In [1]:
import os
import pandas as pd
import numpy as np

import itertools
import operator
import string
from collections import defaultdict, Counter
from s3fs.core import S3FileSystem

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

Plotly Setup


In [2]:
# plotly validate with credentials
with open('../_credentials/plotly.txt', 'r') as infile:
    user, pw = infile.read().strip().split(', ')
    
plotly.tools.set_credentials_file(username=user, api_key=pw)

text_color = 'rgb(107, 107, 107)'
colors_dict = {'grey':'rgb(189, 195, 199)', 'aqua':'rgb( 54, 215, 183)', 'navy':'rgb( 31,  58, 147)',
            'purple':'rgb(142,  68, 173)', 'blue':'rgb( 25, 181, 254)', 'green':'rgb( 46, 204, 113)',
            'yellow':'rgb(253, 231,  76)', 'orange':'rgb(250, 121,  33)', 'red':'rgb(242,  38,  19)'}
colors_lst = [colors_dict['yellow'], colors_dict['orange'], colors_dict['red'], 
              colors_dict['green'], colors_dict['blue'], colors_dict['purple'], 
              colors_dict['navy'], colors_dict['aqua'], colors_dict['grey']]

Load Cleaned Data from S3


In [3]:
# aws keys stored in ini file in same path
os.environ['AWS_CONFIG_FILE'] = 'aws_config.ini'

s3 = S3FileSystem(anon=False)
key = 'data.csv'
bucket = 'luther-02'

df = pd.read_csv(s3.open('{}/{}'.format(bucket, key),mode='rb'))

# update dates to datetime objects
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.DatetimeIndex(df['Released']).year
df['Year_Int'] = pd.to_numeric(df['Year'])
df['Month'] = pd.DatetimeIndex(df['Released']).month

# year extremities
yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)

Number of Torrent Titles by Release Year


In [4]:
# number of titles per year in dataset
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year','Count']

# create plotly data trace
trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=dict(color=colors_dict['red']))

In [5]:
def bar_plot_data(_dataframe, _label, color):
    df_temp = _dataframe[_label].value_counts().reset_index()
    df_temp.columns = [_label,'Count']

    # create plotly data trace
    trace = go.Bar(x=df_temp[_label], y=df_temp['Count'], marker=dict(color=colors_dict[color]))
    
    data = [trace]
    layout = go.Layout(
        title='Quantity of Torrent Titles by Year Released ({0}-{1})'.format(yr_start, yr_stop),
        xaxis=dict(
            title='Release Year',
            tickfont=dict(size=14, color=text_color)),
        yaxis=dict(
            title='Number of Titles',
            titlefont=dict(size=16, color=text_color),
            tickfont=dict(size=14, color=text_color)),
        barmode='group',
        bargap=0.15,
        bargroupgap=0.1)

    fig = go.Figure(data=data, layout=layout)
    return py.iplot(fig, filename='luther_titles_annually({0}-{1})'.format(yr_start, yr_stop))

In [6]:
bar_plot_data(df, 'Year', 'red')


Out[6]:

In [7]:
data = [trace]
layout = go.Layout(
    title='Quantity of Torrent Titles by Year Released ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Release Year',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Number of Titles',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_titles_annually({0}-{1})'.format(yr_start, yr_stop))


Out[7]:

Trim Dataset by Years of Interest/Relevance

Due to the low number of titles for the years below 1995, these torrents were removed from the dataset. Also, since the current year (2016) is only partially completed, films released in 2016 were removed from the dataset as well.


In [8]:
def df_year_limit(start, stop, df):
    mask = (df['Year'] >= start) & (df['Year'] <= stop)
    df = df.loc[mask]
    return df

In [9]:
# get count of records before trimming by year cutoff
yr_before = len(df)
print('{0} records in dataframe before trimming by year cutoff'.format(yr_before))


1664 records in dataframe before trimming by year cutoff

In [10]:
yr_start, yr_stop = (1995, 2015)

# trim by year cutoff
df = df_year_limit(yr_start, yr_stop, df)
yr_after = len(df)

print('{0} entries lost ({1}%) due to date cutoff between {2} and {3}'.format(yr_before-yr_after, 
                                round((yr_before - yr_after)/yr_before *100, 2), yr_start, yr_stop))


267 entries lost (16.05%) due to date cutoff between 1995 and 2015

In [11]:
# number of titles per year in dataset 
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year','Count']

trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=dict(color=colors_dict['blue']))

In [12]:
data = [trace]
layout = go.Layout(
    title='Number of Torrent Titles by Release Year ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Release Year',
        tickfont=dict(size=14,color=text_color)),
    yaxis=dict(
        title='Number of Titles',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_films_annually({0}-{1})'.format(yr_start, yr_stop))


Out[12]:

Quantity Genre Classifications


In [13]:
# split genre strings into a numpy array
def split_to_array(ser):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    return pd.Series(split_array)

# turn numpy array into count of genre occurances
genres = df['Genre'].apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)

# convert series to dataframe for plotting
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']

# bar chart of each genre in dataset
trace = go.Bar(x=genre_ser['Genre'], y=genre_ser['Count'], marker=dict(color=colors_dict['yellow']))

In [14]:
data = [trace]
layout = go.Layout(
    title='Count of Genre Classifications ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Number of Classifications',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_genre_quantity({0}-{1})'.format(yr_start, yr_stop))


Out[14]:

Most Dominant Genre out of Genres Given per Title


In [15]:
def convert_frequency(ser, genres=genres):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    genre = genres.loc[split_array].argmax()
    return genre

# add new column to dataframe classifying genre list as single genre of significance
df['Genre_Single'] = df['Genre'].apply(convert_frequency)

In [16]:
# look at number of single genre counts after extraction
df_count = df['Genre_Single'].value_counts().reset_index()
df_count.columns = ['Genre_Single', 'Count']

# bar chart of significant single genre in dataset
trace = go.Bar(x=df_count['Genre_Single'], y=df_count['Count'], marker=dict(color=colors_dict['yellow']))

In [17]:
data = [trace]
layout = go.Layout(
    title='Quantity of Dominant Genre Classifications ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Quantity of Classifications',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_dominant_genres({0}-{1})'.format(yr_start, yr_stop))


Out[17]:

In [18]:
def df_genre_limit(df, genres):
    df = df[~df['Genre_Single'].isin(genres)]
    return df

In [19]:
# get count of records before trimming by dominant genre cutoff
genre_before = len(df)
print('{0} records in dataframe before trimming by genres'.format(genre_before))


1397 records in dataframe before trimming by genres

In [20]:
# trim by dominant genres
cut_genres = ['Romance', 'Western']
df = df_genre_limit(df, cut_genres)
genre_after = len(df)

str_genres = ', '.join(cut_genres)
print('{0} entries lost ({1}%) due to droppping dominant genres {2}'.format(genre_before-genre_after, 
                                round((genre_before - genre_after)/genre_before *100, 2), str_genres))


0 entries lost (0.0%) due to droppping dominant genres Romance, Western

Dominant Genre Quantities per Year


In [21]:
def get_stackedBar_trace(x_category, y_counts, _name, ind):
    '''
    x_category -- category from feature set
    y_counts -- count of x_category in feature set
    _name -- _name of x_category
    ind -- number indices for color list

    Return: Plotly data trace for bar chart
    '''
    return go.Bar(x=x_category, y=y_counts, name=_name, marker=dict(color=colors_lst[ind]), opacity=0.8)

In [22]:
def get_stackedBar_traces(df, feature, count_label):
    traces = []
    
    for ind, _feat in enumerate(df[feature].unique().tolist()):
        temp_df = df[df[feature] == _feat]
        _value_counts = temp_df[count_label].value_counts()
        temp_dict = _value_counts.to_dict()
        temp_dict = sorted(temp_dict.items())

        feature_lst = [ft for ft, ct in temp_dict]
        count_lst = [ct for ft,ct in temp_dict]

        traces.append(get_stackedBar_trace(feature_lst, count_lst, _feat, ind))
    
    return traces

In [23]:
def get_stackedBar(_dataframe, feature, count_label, _title, _x_title, _y_title, _filename='stackedBar'):
    
    date = get_stackedBar_traces(_dataframe, feature, count_label)
    
    layout = go.Layout(
        title=_title,
        xaxis=dict(
            title=_x_title,
            tickfont=dict(size=14, color='rgb(107, 107, 107)')
        ),
        yaxis=dict(
            title=_y_title,
            titlefont=dict(size=16, color='rgb(107, 107, 107)'),
            tickfont=dict(size=14, color='rgb(107, 107, 107)'),
            dtick=20,
        ),
        barmode='stack',)

    fig = go.Figure(data=data, layout=layout)

    return py.iplot(fig, filename=_filename)

In [24]:
_title = 'Genres Annually ({0}-{1})'.format(yr_start, yr_stop)
_x_title = 'Year'
_y_title = 'Number of Films'
_filename = 'luther_stackedGenres_years({0}-{1})'.format(yr_start, yr_stop)

get_stackedBar(df, 'Genre_Single', 'Year', _title, _x_title, _y_title, _filename='stackedBar')


Out[24]:

In [27]:
traces = []

for i,genre in enumerate(df['Genre_Single'].unique().tolist()):
    _genre_df = df[df['Genre_Single'] == genre]
    _value_counts = _genre_df['Year'].value_counts()
    gen = _value_counts.to_dict()
    gen = sorted(gen.items())
    
    year_lst = [yr for yr,ct in gen]
    count_lst = [ct for yr,ct in gen]
    
    traces.append(make_bar_trace(year_lst, count_lst, genre, i))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-9b468d06020c> in <module>()
     10     count_lst = [ct for yr,ct in gen]
     11 
---> 12     traces.append(make_bar_trace(year_lst, count_lst, genre, i))

NameError: name 'make_bar_trace' is not defined

In [28]:
data = traces[::-1]

layout = go.Layout(
    title='Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Year',
        tickfont=dict(size=14, color='rgb(107, 107, 107)')
    ),
    yaxis=dict(
        title='Number of Films',
        titlefont=dict(size=16, color='rgb(107, 107, 107)'),
        tickfont=dict(size=14, color='rgb(107, 107, 107)'),
        dtick=20,
    ),
    barmode='stack',)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='stacked-bar')


---------------------------------------------------------------------------
PlotlyEmptyDataError                      Traceback (most recent call last)
<ipython-input-28-fa2bf7c5ca52> in <module>()
     17 fig = go.Figure(data=data, layout=layout)
     18 
---> 19 py.iplot(fig, filename='stacked-bar')

~/anaconda/lib/python3.6/site-packages/plotly/plotly/plotly.py in iplot(figure_or_data, **plot_options)
    132     if 'auto_open' not in plot_options:
    133         plot_options['auto_open'] = False
--> 134     url = plot(figure_or_data, **plot_options)
    135 
    136     if isinstance(figure_or_data, dict):

~/anaconda/lib/python3.6/site-packages/plotly/plotly/plotly.py in plot(figure_or_data, validate, **plot_options)
    193 
    194     """
--> 195     figure = tools.return_figure_from_figure_or_data(figure_or_data, validate)
    196     for entry in figure['data']:
    197         if ('type' in entry) and (entry['type'] == 'scattergl'):

~/anaconda/lib/python3.6/site-packages/plotly/tools.py in return_figure_from_figure_or_data(figure_or_data, validate_figure)
   1396         if not figure['data']:
   1397             raise exceptions.PlotlyEmptyDataError(
-> 1398                 "Empty data list found. Make sure that you populated the "
   1399                 "list of data objects you're sending and try again.\n"
   1400                 "Questions? Visit support.plot.ly"

PlotlyEmptyDataError: Empty data list found. Make sure that you populated the list of data objects you're sending and try again.
Questions? Visit support.plot.ly

In [29]:
# ratings
df_rated = df['Rated'].value_counts().reset_index()
df_rated.columns = ['Rated', 'Count']
df_rated


Out[29]:
Rated Count
0 PG-13 603
1 R 576
2 PG 190
3 G 18
4 NOT RATED 2
5 TV-14 1
6 NC-17 1

In [30]:
def df_genre_limit(df, ratings):
    df = df[~df['Rated'].isin(ratings)]
    return df

In [31]:
ratings_remove = ['NOT RATED', 'X', 'TV-14', 'NC-17']
df = df_genre_limit(df, ratings_remove)

df_rated = df['Rated'].value_counts().reset_index()
df_rated.columns = ['Rated', 'Count']

# bar chart of ratings
rated_traces = go.Bar(x=df_rated['Rated'], y=df_count['Count'], marker=dict(color=colors_dict['blue']))

In [32]:
data = [rated_traces]
layout = go.Layout(
    title='Quantity of Dominant Genre Classifications ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Quantity of Classifications',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='luther_dominant_genres({0}-{1})'.format(yr_start, yr_stop))


Out[32]:

Remove Films Not Rated - PG-13, PG, G, or R


In [33]:
# get count of records before trimming by year cutoff
rated_before = len(df)
print('{0} records in dataframe before trimming by rating'.format(rated_before))


1393 records in dataframe before trimming by rating

In [34]:
ratings = ['PG-13', 'PG', 'G', 'R']
df = df.loc[df['Rated'].isin(ratings)]

rated_after = len(df)

print('{0} entries lost ({1}%) due to limiting to only {2} ratings'.format(rated_before-rated_after, 
                        round((rated_before-rated_after)/rated_before *100, 2), ', '.join(ratings)))

print('{0} entries lost total ({1}%)'.format(yr_before-rated_after,
                                            round((yr_before-rated_after)/yr_before *100, 2)))


6 entries lost (0.43%) due to limiting to only PG-13, PG, G, R ratings
277 entries lost total (16.65%)

In [35]:
# Combine Genre_Single and Rating as a new label
df['Genre_Rated'] = df['Genre_Single'] + ' ' + df['Rated']

df['Gen_Rat_Run'] = df['Genre_Rated'] + ' ' + df['Runtime'].apply(lambda x: str(x))

df['Gen_Rat_Bud'] = df['Genre_Rated'] + ' ' + df['Prod_Budget'].apply(lambda x: str(x))

df['Gen_Sin'] = df['Genre_Single']

In [36]:
df.columns


Out[36]:
Index(['Title', 'Released', 'Prod_Budget', 'Dom_Gross', 'World_Gross', 'Rated',
       'Runtime', 'Genre', 'Director', 'Actors', 'Pirate_Count',
       'Torrentz_Count', 'Zoogle_Ver_Count', 'Year', 'Month', 'Total_Torrents',
       'Year_Int', 'Genre_Single', 'Genre_Rated', 'Gen_Rat_Run', 'Gen_Rat_Bud',
       'Gen_Sin'],
      dtype='object')

In [37]:
colors_scat = colors_lst[:-2][::-1]
df_scat = df[['Prod_Budget', 'Runtime', 'Gen_Rat_Bud', 'Gen_Rat_Run', 'Gen_Sin']]

fig = FF.create_scatterplotmatrix(df_scat, diag='histogram', index='Gen_Sin', 
                                    height=1000, width=1000, colormap=colors_scat[::-1])
py.iplot(fig, filename='Luther Scatterplot Matrix')


/Users/bryant/anaconda/lib/python3.6/site-packages/plotly/tools.py:1422: UserWarning:

plotly.tools.FigureFactory.create_scatterplotmatrix is deprecated. Use plotly.figure_factory.create_scatterplotmatrix

Out[37]:

Log Transform Scatter Matrix


In [38]:
df['Log_Prod_Bud'] = np.log(df['Prod_Budget'])
df['Log_Runtime'] = np.log(df['Runtime'])
df['Log_Ttl_Tor'] = np.log(df['Total_Torrents'])

In [39]:
colors_scat = colors_lst[:-2][::-1]
df_scat = df[['Log_Ttl_Tor', 'Log_Prod_Bud', 'Log_Runtime', 'Gen_Rat_Bud', 'Gen_Rat_Run', 'Gen_Sin']]

fig = FF.create_scatterplotmatrix(df_scat, diag='histogram', index='Gen_Sin', 
                                    height=1000, width=1000, colormap=colors_scat[::-1])
_ = py.iplot(fig, filename='Log Luther Scatterplot Matrix')


/Users/bryant/anaconda/lib/python3.6/site-packages/plotly/tools.py:1422: UserWarning:

plotly.tools.FigureFactory.create_scatterplotmatrix is deprecated. Use plotly.figure_factory.create_scatterplotmatrix

Drama Only


In [40]:
df_drama = df[df['Genre_Single'] == 'Drama'].reset_index()
df_drama = df_drama.drop('index',axis=1)
df_drama['Log_Bud_Rated'] = df['Log_Prod_Bud'].apply(lambda x: str(x)) + ' ' + df['Rated']

In [41]:
#df_scat = df_drama[['Log_Ttl_Tor', 'Log_Prod_Bud', 'Log_Runtime', 'Log_Bud_Rated', 'Gen_Sin']]
df_scat = df_drama[['Total_Torrents', 'Prod_Budget', 'Runtime', 'Rated', 'Gen_Sin']]

fig = FF.create_scatterplotmatrix(df_scat, diag='histogram', index='Gen_Sin', 
                                    height=1000, width=1000, colormap=colors_scat[::-1])
_ = py.iplot(fig, filename='Log Drama Luther Scatterplot Matrix')


/Users/bryant/anaconda/lib/python3.6/site-packages/plotly/tools.py:1422: UserWarning:

plotly.tools.FigureFactory.create_scatterplotmatrix is deprecated. Use plotly.figure_factory.create_scatterplotmatrix


In [42]:
from patsy import dmatrices
patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Month + Runtime + Genre_Single'
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-42-892f5235bd9b> in <module>()
      1 from patsy import dmatrices
      2 patsy_formula = 'Total_Torrents ~ Prod_Budget + Year + Month + Runtime + Genre_Single'
----> 3 y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')

NameError: name 'df_sub' is not defined

In [43]:
import statsmodels.api as sm
model = sm.OLS(y, x)
results = model.fit()
results.summary()


/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning:

The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-57a6d5dfcd2e> in <module>()
      1 import statsmodels.api as sm
----> 2 model = sm.OLS(y, x)
      3 results = model.fit()
      4 results.summary()

NameError: name 'y' is not defined

In [44]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)
mod_lr_score = model.score(x, y)
mod_lr_coef = model.coef_
model.results


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-44-1007d5ea6fda> in <module>()
      1 from sklearn.linear_model import LinearRegression
      2 model = LinearRegression()
----> 3 model.fit(x, y)
      4 mod_lr_score = model.score(x, y)
      5 mod_lr_coef = model.coef_

NameError: name 'x' is not defined

In [45]:
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)

# store results
mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
cv_mod_score = model.score(x_train, y_train)


/Users/bryant/anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning:

This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-0e20577e473f> in <module>()
      2 from sklearn import metrics
      3 
----> 4 x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
      5 
      6 model = LinearRegression().fit(x_train, y_train)

NameError: name 'x' is not defined

In [46]:
# reset x, y otherwise errors occur
y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
cvKf_mod_score = clf2.score(x,y)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-46-7037a7dd77cd> in <module>()
      1 # reset x, y otherwise errors occur
----> 2 y, x = dmatrices(patsy_formula, data=df_sub, return_type='dataframe')
      3 
      4 from sklearn.cross_validation import KFold
      5 kf = KFold(len(df_sub), n_folds=10, shuffle=True)

NameError: name 'df_sub' is not defined

In [47]:
#NORMAL RESULTS
print('Model Linear Regression Score = {0}'.format(mod_lr_score))
print('            Mean Square Error = {0}'.format(mean_sq_err))
print(' Cross Validation Model Score = {0}'.format(cv_mod_score))
print('     Mean Squred Error K-Fold = {0}'.format(mean_sq_errKf))
print('Cross Val. K-Fold Model Score = {0}'.format(cvKf_mod_score))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-47-7abd27bb02d6> in <module>()
      1 #NORMAL RESULTS
----> 2 print('Model Linear Regression Score = {0}'.format(mod_lr_score))
      3 print('            Mean Square Error = {0}'.format(mean_sq_err))
      4 print(' Cross Validation Model Score = {0}'.format(cv_mod_score))
      5 print('     Mean Squred Error K-Fold = {0}'.format(mean_sq_errKf))

NameError: name 'mod_lr_score' is not defined

In [48]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Prod_Budget', fig=fig)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-48-eddc8013101e> in <module>()
----> 1 fig = plt.figure(figsize=(12,8))
      2 fig = sm.graphics.plot_regress_exog(results,'Prod_Budget', fig=fig)

NameError: name 'plt' is not defined

Log Transform


In [49]:
df.columns


Out[49]:
Index(['Title', 'Released', 'Prod_Budget', 'Dom_Gross', 'World_Gross', 'Rated',
       'Runtime', 'Genre', 'Director', 'Actors', 'Pirate_Count',
       'Torrentz_Count', 'Zoogle_Ver_Count', 'Year', 'Month', 'Total_Torrents',
       'Year_Int', 'Genre_Single', 'Genre_Rated', 'Gen_Rat_Run', 'Gen_Rat_Bud',
       'Gen_Sin', 'Log_Prod_Bud', 'Log_Runtime', 'Log_Ttl_Tor'],
      dtype='object')

In [50]:
df_sub['log_budg']=np.log(df_sub.Prod_Budget)
#df_sub['log_year']=np.log(df_sub.Year)
#df_sub['log_run']=np.log(df_sub.Runtime)
df_sub['log_tor']=np.log(df_sub.Total_Torrents)

trans = df_sub[['log_budg', 'Year', 'log_tor']]
plt.rcParams['figure.figsize'] = (15, 15)
pd.tools.plotting.scatter_matrix(trans)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-33979681c548> in <module>()
----> 1 df_sub['log_budg']=np.log(df_sub.Prod_Budget)
      2 #df_sub['log_year']=np.log(df_sub.Year)
      3 #df_sub['log_run']=np.log(df_sub.Runtime)
      4 df_sub['log_tor']=np.log(df_sub.Total_Torrents)
      5 

NameError: name 'df_sub' is not defined

In [51]:
log_patsy_formula = 'log_tor ~ log_budg + Year + Month'
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-51-0a4df121d3a0> in <module>()
      1 log_patsy_formula = 'log_tor ~ log_budg + Year + Month'
----> 2 y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')

NameError: name 'df_sub' is not defined

In [52]:
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

df_a = df_sub[['log_budg', 'Year', 'Month', 'log_tor']]
fig = FF.create_scatterplotmatrix(df_a, diag='histogram', index='Month',
                                  height=800, width=800)
py.iplot(fig, filename='Histograms along Diagonal Subplots')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-52-cfd7edd261f3> in <module>()
      2 from plotly.tools import FigureFactory as FF
      3 
----> 4 df_a = df_sub[['log_budg', 'Year', 'Month', 'log_tor']]
      5 fig = FF.create_scatterplotmatrix(df_a, diag='histogram', index='Month',
      6                                   height=800, width=800)

NameError: name 'df_sub' is not defined

In [53]:
import statsmodels.formula.api as smf
results = smf.ols(formula=log_patsy_formula, data=df_sub,).fit()
results.summary()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-53-cfa98826c53d> in <module>()
      1 import statsmodels.formula.api as smf
----> 2 results = smf.ols(formula=log_patsy_formula, data=df_sub,).fit()
      3 results.summary()

NameError: name 'df_sub' is not defined

In [54]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)

# store results
log_mod_lr_score = model.score(x,y)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-54-0ffa9ab46d74> in <module>()
      1 from sklearn.linear_model import LinearRegression
      2 model = LinearRegression()
----> 3 model.fit(x, y)
      4 
      5 # store results

NameError: name 'x' is not defined

In [55]:
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)

# store results
log_mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cv_mod_score = model.score(x_train, y_train)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-55-163b2c3f3b5b> in <module>()
      2 from sklearn import metrics
      3 
----> 4 x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)
      5 
      6 model = LinearRegression().fit(x_train, y_train)

NameError: name 'x' is not defined

In [56]:
# reset x, y otherwise errors occur
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
log_mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cvKf_mod_score = clf2.score(x,y)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-56-247f3fba12d9> in <module>()
      1 # reset x, y otherwise errors occur
----> 2 y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')
      3 
      4 from sklearn.cross_validation import KFold
      5 kf = KFold(len(df_sub), n_folds=10, shuffle=True)

NameError: name 'df_sub' is not defined

In [57]:
#LOG RESULTS
print('Log Model Linear Regression Score = {0}'.format(log_mod_lr_score))
print('            Log Mean Square Error = {0}'.format(log_mean_sq_err))
print(' Log Cross Validation Model Score = {0}'.format(log_cv_mod_score))
print('     Log Mean Squred Error K-Fold = {0}'.format(log_mean_sq_errKf))
print('Log Cross Val. K-Fold Model Score = {0}'.format(log_cvKf_mod_score))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-57-9f95498a38ed> in <module>()
      1 #LOG RESULTS
----> 2 print('Log Model Linear Regression Score = {0}'.format(log_mod_lr_score))
      3 print('            Log Mean Square Error = {0}'.format(log_mean_sq_err))
      4 print(' Log Cross Validation Model Score = {0}'.format(log_cv_mod_score))
      5 print('     Log Mean Squred Error K-Fold = {0}'.format(log_mean_sq_errKf))

NameError: name 'log_mod_lr_score' is not defined

In [58]:
df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')

df_TEST['log_budg']=np.log(df_TEST.Prod_Budget)
df_TEST['log_run']=np.log(df_TEST.Runtime)
df_TEST['log_tor']=np.log(df_TEST.Total_Torrents)

def split_to_array(ser):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    return pd.Series(split_array)

genres = df_yr.Genre.apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)

def convert_frequency(ser, genres=genres):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    genre = genres.loc[split_array].argmax()
    return genre

df_TEST['Genre_Single'] = df_TEST.Genre.apply(convert_frequency)

log_patsy_formula_test = 'log_tor ~ log_budg + Year + Month + log_run + Genre_Single'
y, x = dmatrices(log_patsy_formula_test, data=df_TEST, return_type='dataframe')

print(clf2.score(x_test, y_test))
print(metrics.mean_squared_error(y_test,model.predict(x_test)))


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-58-89eac684872a> in <module>()
----> 1 df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')
      2 
      3 df_TEST['log_budg']=np.log(df_TEST.Prod_Budget)
      4 df_TEST['log_run']=np.log(df_TEST.Runtime)
      5 df_TEST['log_tor']=np.log(df_TEST.Total_Torrents)

~/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'data/test_data2.csv' does not exist

In [59]:
_ = plt.plot(y, model.predict(x), 'bo')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-59-08cddbda409d> in <module>()
----> 1 _ = plt.plot(y, model.predict(x), 'bo')

NameError: name 'plt' is not defined

In [60]:
plt.figure(figsize=(25,10))

ind = np.arange(len(yr_dict))
width = 0.35

bar_year = [year for year, count in yr_lst]
bar_count = [count for year, count in yr_lst]

plt.bar(ind, bar_count, width, color='r')

plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
plt.xticks(ind + width/2., (bar_year), rotation='vertical')
plt.yticks(np.arange(0, 91, 5))

plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-60-0ae9a7bddc76> in <module>()
----> 1 plt.figure(figsize=(25,10))
      2 
      3 ind = np.arange(len(yr_dict))
      4 width = 0.35
      5 

NameError: name 'plt' is not defined

In [61]:
#log_tor ~ log_budg + Year + Month + log_run + Genre_Single'

In [62]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'log_budg', fig=fig)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-62-37654b6c542c> in <module>()
----> 1 fig = plt.figure(figsize=(12,8))
      2 fig = sm.graphics.plot_regress_exog(results,'log_budg', fig=fig)

NameError: name 'plt' is not defined

In [63]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Year', fig=fig)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-63-32590a27fb0c> in <module>()
----> 1 fig = plt.figure(figsize=(12,8))
      2 fig = sm.graphics.plot_regress_exog(results,'Year', fig=fig)

NameError: name 'plt' is not defined

In [64]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Month', fig=fig)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-64-e8bca8164bd5> in <module>()
----> 1 fig = plt.figure(figsize=(12,8))
      2 fig = sm.graphics.plot_regress_exog(results,'Month', fig=fig)

NameError: name 'plt' is not defined

In [ ]:


In [ ]: