In [36]:
import os
import pandas as pd
import numpy as np

import itertools
import operator
import string
from collections import defaultdict, Counter

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

Plotly Setup


In [46]:
# plotly username and password
with open('../_credentials/plotly.txt', 'r') as infile:
    user, pw = infile.read().strip().split(', ')
    
plotly.tools.set_credentials_file(username=user, api_key=pw)

text_color = 'rgb(107, 107, 107)'

colors_dict = {'grey':'rgb(189, 195, 199)', 'aqua':'rgb( 54, 215, 183)', 'navy':'rgb( 31,  58, 147)',
            'purple':'rgb(142,  68, 173)', 'blue':'rgb( 25, 181, 254)', 'green':'rgb( 46, 204, 113)',
            'yellow':'rgb(253, 231,  76)', 'orange':'rgb(250, 121,  33)', 'red':'rgb(242,  38,  19)'}

colors_lst = [colors_dict['yellow'], colors_dict['orange'], colors_dict['red'], 
              colors_dict['green'], colors_dict['blue'], colors_dict['purple'], 
              colors_dict['navy'], colors_dict['aqua'], colors_dict['grey']]

Load Cleaned Data


In [47]:
from src.my_aws import S3

KEY_OMDB_TOR = 'OMDB_Torrents.csv'
KEY_NUM = 'TheNumbers_budgets.csv'
KEY_FINAL = 'Final_Data.csv'
BUCKET = 'movie-torrents'

s3_obj = S3()
df = s3_obj.get_data(KEY_FINAL, BUCKET)

# update dates to datetime objects
df['Released'] = pd.to_datetime(df['Released'])
df['Year'] = pd.to_numeric(pd.DatetimeIndex(df['Released']).year)
df['Month'] = pd.to_numeric(pd.DatetimeIndex(df['Released']).month)

Number of Films Released Annually


In [48]:
# number of titles per year in dataset
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year','Count']

yr_start = df['Year'].min(axis=0)
yr_stop = df['Year'].max(axis=0)
trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=dict(color=colors_dict['red']))

In [49]:
data = [trace]
layout = go.Layout(
    title='Movies Released Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Release Year',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Number of Movies',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')


Out[49]:

Trim Dataset by Years of Interest


In [50]:
def df_year_limit(start, stop, df):
    mask = (df['Year'] >= start) & (df['Year'] <= stop)
    df = df.loc[mask]
    return df

In [54]:
yr_start = 1940
yr_stop = 2015

len_before = len(df)
df = df_year_limit(yr_start, yr_stop, df)
len_after = len(df)

print('{0} entries lost ({1}%) due to date cutoff between {2} and {3}'.format(
    len_before-len_after, round((len_before-len_after)/len_before *100, 2), yr_start, yr_stop))


72 entries lost (1.86%) due to date cutoff between 1940 and 2015

In [55]:
# number of titles per year in dataset 
df_yr = df['Year'].value_counts().reset_index()
df_yr.columns = ['Year','Count']

trace = go.Bar(x=df_yr['Year'], y=df_yr['Count'], marker=dict(color=colors_dict['orange']))

In [56]:
data = [trace]
layout = go.Layout(
    title='Movies Released Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Release Year',
        tickfont=dict(size=14,color=text_color)),
    yaxis=dict(
        title='Number of Movies',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')


Out[56]:

Number of Films per Genre


In [57]:
# split genre strings into a numpy array
def split_to_array(ser):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    return pd.Series(split_array)

# turn numpy array into count of genre occurances
genres = df['Genre'].apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)

# convert series to dataframe for plotting
genre_ser = genres.reset_index()
genre_ser.columns = ['Genre', 'Count']

# bar chart of each genre in dataset
trace = go.Bar(x=genre_ser['Genre'], y=genre_ser['Count'], marker=dict(color=colors_dict['yellow']))

In [58]:
data = [trace]
layout = go.Layout(
    title='Genre Occurences ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Occurences in Classification',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')


Out[58]:

Add Column for Genre That Occurs the Most out of Genres String


In [59]:
def convert_frequency(ser, genres=genres):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    genre = genres.loc[split_array].argmax()
    return genre

# add new column to dataframe classifying genre list as single genre of significance
df['Genre_Single'] = df['Genre'].apply(convert_frequency)

In [60]:
# look at number of single genre counts after extraction
df_count = df['Genre_Single'].value_counts().reset_index()
df_count.columns = ['Genre_Single', 'Count']

# bar chart of significant single genre in dataset
trace = go.Bar(x=df_count['Genre_Single'], y=df_count['Count'], marker=dict(color=colors_dict['blue']))

In [61]:
data = [trace]
layout = go.Layout(
    title='Dominate Genre Count ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Genre',
        tickfont=dict(size=14, color=text_color)),
    yaxis=dict(
        title='Genre Occurences',
        titlefont=dict(size=16, color=text_color),
        tickfont=dict(size=14, color=text_color)),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='style-bar')


Out[61]:

Dominate Genres Breakdown by Year


In [64]:
def make_bar_trace(x_years, y_counts, name_genre, i):
    trace = go.Bar(
        x = x_years, 
        y = y_counts, 
        name = name_genre, 
#         marker=dict(
#             color=colors_lst[i]),
        opacity=0.8
        )
    return trace

In [65]:
traces = []

for i,genre in enumerate(df_count['Genre_Single'].unique().tolist()):
    _genre_df = df[df['Genre_Single'] == genre]
    _value_counts = _genre_df['Year'].value_counts()
    gen = _value_counts.to_dict()
    gen = sorted(gen.items())
    
    year_lst = [yr for yr,ct in gen]
    count_lst = [ct for yr,ct in gen]
    
    traces.append(make_bar_trace(year_lst, count_lst, genre, i))

In [66]:
data = traces[::-1]

layout = go.Layout(
    title='Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    xaxis=dict(
        title='Year',
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Number of Films',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        ),
        dtick=20,
    ),
    barmode='stack',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')


Out[66]:

In [67]:
def fill_missing_year(tup_lst):
    years = df['Year'].sort_values().unique().tolist()
    
    itms = [yr for yr, ct in tup_lst]
    mis_year = set(years) - set(itms)
    
    for yr in mis_year:
        tup_lst.append((yr, 0))
        
    return tup_lst

In [68]:
def tri_tup_dict():
    temp = {}

    for i, genre in enumerate(df_count['Genre_Single'].unique().tolist()):
        _genre_df = df[df['Genre_Single'] == genre]
        _value_counts = _genre_df['Year'].value_counts()
        gen = _value_counts.to_dict()
        gen = sorted(gen.items())

        # fill in years with zero counts
        fill_missing_year(gen)

        # create triple tuple of genre, year, count
        years = [yr for yr, ct in gen]
        counts = [ct for yr, ct in gen]
        genres = [genre] * len(gen)
        tri_tup = zip(genres, years, counts)

        # create dictionary keyed on genre and year with genre counts as value
        for tup in tri_tup:
            temp[(tup[0], tup[1])] = tup[2]
    
    return temp

In [69]:
def percentage_dict(tri_dict):
    years = [y for y in range(df['Year'].min(), df['Year'].max()+1)]
    
    # by year so that year stack adds up to 100%
    for year in years:
        temp = {}

        # get subset (year) dictionary
        for key,ct in tri_dict.items():
            if year in key:
                temp[key] = ct

        # get total for year subset
        total = sum(temp.values())

        # put percentages back into main dictionary
        for key,ct in temp.items():
            tri_dict[key] = ct/total*100
    
    return tri_dict

In [70]:
def normalized_check(_dict):
    years = [y for y in range(df['Year'].min(), df['Year'].max()+1)]
    
    for year in years:
        temp = []
        for key,ct in _dict.items():
            if year in key:
                temp.append(ct)
        print('For {0} the total is: {1}'.format(year, sum(temp)))

In [71]:
# normalized_check(percentage_dict(tri_tup_dict()))

In [72]:
def make_filled_line_trace(x_years, y_counts, name_genre, color):
    trace = go.Scatter(
        x = x_years, 
        y = y_counts, 
        name = name_genre,
        mode = 'lines',
        line = dict(width=0.5,
                   color = color),
        fill = 'tonexty')
    return trace

In [73]:
ord_lst = ['Crime', 'Adventure', 'Thriller', 'Horror', 'Action', 'Comedy', 'Drama']

new_dict = {}
ttl = np.array(0)
for _gen in ord_lst:
    temp = []
    for tup in sorted(percentage_dict(tri_tup_dict()).items()):
        if _gen in tup[0]:
            temp.append(tup[1])
    ttl = ttl + temp
    new_dict[_gen] = ttl

In [74]:
traces = []
ord_lst = ['Crime', 'Adventure', 'Thriller', 'Horror', 'Action', 'Comedy', 'Drama']
years = [y for y in range(df['Year'].min(), df['Year'].max()+1)]

for i, genre in enumerate(ord_lst):
    yr_lst = years
    ct_lst = new_dict[genre].tolist()

    traces.append(make_filled_line_trace(yr_lst, ct_lst, genre, colors_lst[i]))

In [75]:
data = traces

layout = go.Layout(
    title='Normalized Genres Annually ({0}-{1})'.format(yr_start, yr_stop),
    showlegend=True,
    autosize=False,
    width=1000,
    height=600,
    margin=go.Margin(
        l=60,
        r=40,
        b=100,
        t=100,
        pad=4
    ),
    xaxis=dict(
        title='Year',
        tickfont=dict(
        size=14,
        color='rgb(107,107,107)'),
        type='category',
        dtick=2
    ),
    yaxis=dict(
        title='Normalized Film Count % (Accumulative)',
        titlefont=dict(
        size=16,
        color='rgb(107,107,107)'),
        tickfont=dict(
        size=14,
        color='rgb(107,107,107)'),
        type='linear',
        range=[1, 100],
        dtick=20,
        ticksuffix='%'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-area-plot')


Out[75]:

Remove Films Not Rated - PG-13, PG, G, or R


In [76]:
len_before = len(df)

ratings = ['PG-13', 'PG', 'G', 'R']
df = df.loc[df['Rated'].isin(ratings)]

len_after = len(df)

print('{0} entries lost ({1}%) due to limiting to only {2} ratings'.format(len_before-len_after, 
                                                round((len_before-len_after)/len_before *100, 2), ', '.join(ratings)))


306 entries lost (0.03%) due to limiting to only PG-13, PG, G, R ratings

In [83]:
from patsy import dmatrices
patsy_formula = 'Total_Torrents ~ ProductionBudget + Year + Month + Runtime + Genre_Single'
y, x = dmatrices(patsy_formula, data=df, return_type='dataframe')

In [84]:
import statsmodels.api as sm
model = sm.OLS(y, x)
results = model.fit()
results.summary()


/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning:

The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.

Out[84]:
OLS Regression Results
Dep. Variable: Total_Torrents R-squared: 0.021
Model: OLS Adj. R-squared: 0.015
Method: Least Squares F-statistic: 3.282
Date: Tue, 19 Sep 2017 Prob (F-statistic): 2.08e-07
Time: 16:07:41 Log-Likelihood: -30308.
No. Observations: 3499 AIC: 6.066e+04
Df Residuals: 3475 BIC: 6.081e+04
Df Model: 23
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept -1.529e+04 5297.081 -2.886 0.004 -2.57e+04 -4901.242
Genre_Single[T.Adventure] -124.9848 177.203 -0.705 0.481 -472.418 222.449
Genre_Single[T.Animation] -325.8469 995.677 -0.327 0.743 -2278.019 1626.325
Genre_Single[T.Biography] -86.7900 814.236 -0.107 0.915 -1683.218 1509.638
Genre_Single[T.Comedy] -1.7258 81.731 -0.021 0.983 -161.971 158.520
Genre_Single[T.Crime] -140.1793 237.757 -0.590 0.556 -606.337 325.979
Genre_Single[T.Documentary] 577.0270 397.604 1.451 0.147 -202.534 1356.588
Genre_Single[T.Drama] -124.2358 79.136 -1.570 0.117 -279.394 30.923
Genre_Single[T.Family] -311.0167 995.660 -0.312 0.755 -2263.154 1641.121
Genre_Single[T.Fantasy] -90.4315 578.317 -0.156 0.876 -1224.306 1043.443
Genre_Single[T.History] -268.3950 995.633 -0.270 0.788 -2220.479 1683.689
Genre_Single[T.Horror] -46.7695 167.960 -0.278 0.781 -376.080 282.541
Genre_Single[T.Music] -191.0692 474.178 -0.403 0.687 -1120.765 738.627
Genre_Single[T.Musical] -333.0652 1405.640 -0.237 0.813 -3089.029 2422.899
Genre_Single[T.Mystery] 81.5917 1406.814 0.058 0.954 -2676.673 2839.857
Genre_Single[T.Romance] -142.0503 411.427 -0.345 0.730 -948.714 664.613
Genre_Single[T.Sci-Fi] 35.3654 996.250 0.035 0.972 -1917.930 1988.661
Genre_Single[T.Sport] -260.6294 995.444 -0.262 0.793 -2212.343 1691.084
Genre_Single[T.Thriller] -35.1654 142.373 -0.247 0.805 -314.308 243.977
Genre_Single[T.Western] -141.9018 813.840 -0.174 0.862 -1737.555 1453.751
ProductionBudget 3.547e-06 6.96e-07 5.097 0.000 2.18e-06 4.91e-06
Year 7.7225 2.628 2.939 0.003 2.571 12.874
Month -2.5716 6.927 -0.371 0.710 -16.154 11.010
Runtime 1.1146 1.401 0.796 0.426 -1.632 3.861
Omnibus: 9272.558 Durbin-Watson: 2.006
Prob(Omnibus): 0.000 Jarque-Bera (JB): 203131926.646
Skew: 31.014 Prob(JB): 0.00
Kurtosis: 1181.752 Cond. No. 1.26e+10

In [90]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)
mod_lr_score = model.score(x, y)
mod_lr_coef = model.coef_
mod_lr_score


Out[90]:
0.021260526233010221

In [91]:
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)

# store results
mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
cv_mod_score = model.score(x_train, y_train)


/Users/bryant/anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning:

This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.


In [94]:
# reset x, y otherwise errors occur
y, x = dmatrices(patsy_formula, data=df, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
cvKf_mod_score = clf2.score(x,y)

In [95]:
#NORMAL RESULTS
print('Model Linear Regression Score = {0}'.format(mod_lr_score))
print('            Mean Square Error = {0}'.format(mean_sq_err))
print(' Cross Validation Model Score = {0}'.format(cv_mod_score))
print('     Mean Squred Error K-Fold = {0}'.format(mean_sq_errKf))
print('Cross Val. K-Fold Model Score = {0}'.format(cvKf_mod_score))


Model Linear Regression Score = 0.02126052623301022
            Mean Square Error = 2040304.0674375007
 Cross Validation Model Score = 0.020571788946668668
     Mean Squred Error K-Fold = 2089663.6995366786
Cross Val. K-Fold Model Score = 0.021048862033185012

In [103]:
% matplotlib inline
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'ProductionBudget', fig=fig)



Log Transform


In [104]:
df.columns


Out[104]:
Index(['Actors', 'Awards', 'BoxOffice', 'Country', 'DVD', 'Director', 'Genre',
       'Language', 'Metascore', 'Production', 'Rated', 'Released', 'Runtime',
       'Title', 'Type', 'Writer', 'imdbID', 'imdbRating', 'imdbVotes',
       'Kat_Count', 'Pirate_Count', 'Extra_Count', 'Torrentz_Count',
       'Torrentz_Ver_Count', 'Zoogle_Ver_Count', 'ProductionBudget',
       'DomesticBudget', 'WorldGross', 'Total_Torrents', 'Year', 'Month',
       'Genre_Single'],
      dtype='object')

In [106]:
df['log_budg']=np.log(df['ProductionBudget'])
#df_sub['log_year']=np.log(df_sub.Year)
#df_sub['log_run']=np.log(df_sub.Runtime)
df['log_tor']=np.log(df['Total_Torrents'])

trans = df[['log_budg', 'Year', 'log_tor']]
plt.rcParams['figure.figsize'] = (15, 15)
pd.tools.plotting.scatter_matrix(trans)


/Users/bryant/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:4: RuntimeWarning:

divide by zero encountered in log

/Users/bryant/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:8: FutureWarning:

'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-106-1c17164d9e39> in <module>()
      6 trans = df[['log_budg', 'Year', 'log_tor']]
      7 plt.rcParams['figure.figsize'] = (15, 15)
----> 8 pd.tools.plotting.scatter_matrix(trans)

~/anaconda/lib/python3.6/site-packages/pandas/tools/plotting.py in wrapper(*args, **kwargs)
     15                           "import 'pandas.plotting.{t}' instead.".format(t=t),
     16                           FutureWarning, stacklevel=2)
---> 17             return getattr(_plotting, t)(*args, **kwargs)
     18         return wrapper
     19 

~/anaconda/lib/python3.6/site-packages/pandas/plotting/_misc.py in scatter_matrix(frame, alpha, figsize, ax, grid, diagonal, marker, density_kwds, hist_kwds, range_padding, **kwds)
     89                 # Deal with the diagonal by drawing a histogram there.
     90                 if diagonal == 'hist':
---> 91                     ax.hist(values, **hist_kwds)
     92 
     93                 elif diagonal in ('kde', 'density'):

~/anaconda/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1896                     warnings.warn(msg % (label_namer, func.__name__),
   1897                                   RuntimeWarning, stacklevel=2)
-> 1898             return func(ax, *args, **kwargs)
   1899         pre_doc = inner.__doc__
   1900         if pre_doc is None:

~/anaconda/lib/python3.6/site-packages/matplotlib/axes/_axes.py in hist(***failed resolving arguments***)
   6193             # this will automatically overwrite bins,
   6194             # so that each histogram uses the same bins
-> 6195             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
   6196             m = m.astype(float)  # causes problems later if it's an int
   6197             if mlast is None:

~/anaconda/lib/python3.6/site-packages/numpy/lib/function_base.py in histogram(a, bins, range, normed, weights, density)
    668     if not np.all(np.isfinite([mn, mx])):
    669         raise ValueError(
--> 670             'range parameter must be finite.')
    671     if mn == mx:
    672         mn -= 0.5

ValueError: range parameter must be finite.

In [108]:
log_patsy_formula = 'log_tor ~ log_budg + Year + Month'
y, x = dmatrices(log_patsy_formula, data=df, return_type='dataframe')

In [109]:
import plotly.plotly as py
from plotly.tools import FigureFactory as FF

df_a = df[['log_budg', 'Year', 'Month', 'log_tor']]
fig = FF.create_scatterplotmatrix(df_a, diag='histogram', index='Month',
                                  height=800, width=800)
py.iplot(fig, filename='Histograms along Diagonal Subplots')


/Users/bryant/anaconda/lib/python3.6/site-packages/plotly/tools.py:1422: UserWarning:

plotly.tools.FigureFactory.create_scatterplotmatrix is deprecated. Use plotly.figure_factory.create_scatterplotmatrix

Out[109]:

In [110]:
import statsmodels.formula.api as smf
results = smf.ols(formula=log_patsy_formula, data=df,).fit()
results.summary()


/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/regression/linear_model.py:1366: RuntimeWarning:

invalid value encountered in subtract

/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning:

invalid value encountered in greater

/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning:

invalid value encountered in less

/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1818: RuntimeWarning:

invalid value encountered in less_equal

Out[110]:
OLS Regression Results
Dep. Variable: log_tor R-squared: nan
Model: OLS Adj. R-squared: nan
Method: Least Squares F-statistic: nan
Date: Tue, 19 Sep 2017 Prob (F-statistic): nan
Time: 16:13:07 Log-Likelihood: nan
No. Observations: 3499 AIC: nan
Df Residuals: 3495 BIC: nan
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept nan nan nan nan nan nan
log_budg inf nan nan nan nan nan
Year nan nan nan nan nan nan
Month nan nan nan nan nan nan
Omnibus: nan Durbin-Watson: nan
Prob(Omnibus): nan Jarque-Bera (JB): nan
Skew: nan Prob(JB): nan
Kurtosis: nan Cond. No. 4.11e+05

In [ ]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x, y)

# store results
log_mod_lr_score = model.score(x,y)

In [ ]:
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.20,random_state=1234)

model = LinearRegression().fit(x_train, y_train)

# store results
log_mean_sq_err = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cv_mod_score = model.score(x_train, y_train)

In [ ]:
# reset x, y otherwise errors occur
y, x = dmatrices(log_patsy_formula, data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=10, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

# store results
log_mean_sq_errKf = metrics.mean_squared_error(y_train,model.predict(x_train))
log_cvKf_mod_score = clf2.score(x,y)

In [ ]:
#LOG RESULTS
print('Log Model Linear Regression Score = {0}'.format(log_mod_lr_score))
print('            Log Mean Square Error = {0}'.format(log_mean_sq_err))
print(' Log Cross Validation Model Score = {0}'.format(log_cv_mod_score))
print('     Log Mean Squred Error K-Fold = {0}'.format(log_mean_sq_errKf))
print('Log Cross Val. K-Fold Model Score = {0}'.format(log_cvKf_mod_score))

In [ ]:
df_TEST = pd.read_csv('data/test_data2.csv', encoding='latin-1')

df_TEST['log_budg']=np.log(df_TEST.Prod_Budget)
df_TEST['log_run']=np.log(df_TEST.Runtime)
df_TEST['log_tor']=np.log(df_TEST.Total_Torrents)

def split_to_array(ser):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    return pd.Series(split_array)

genres = df_yr.Genre.apply(split_to_array)
genres = pd.Series(genres.values.ravel()).dropna()
genres = genres.value_counts().sort_values(ascending=False)

def convert_frequency(ser, genres=genres):
    split_array = np.array(ser.strip().replace(',','').split(' '))
    genre = genres.loc[split_array].argmax()
    return genre

df_TEST['Genre_Single'] = df_TEST.Genre.apply(convert_frequency)

log_patsy_formula_test = 'log_tor ~ log_budg + Year + Month + log_run + Genre_Single'
y, x = dmatrices(log_patsy_formula_test, data=df_TEST, return_type='dataframe')

print(clf2.score(x_test, y_test))
print(metrics.mean_squared_error(y_test,model.predict(x_test)))

In [ ]:
_ = plt.plot(y, model.predict(x), 'bo')

In [ ]:
plt.figure(figsize=(25,10))

ind = np.arange(len(yr_dict))
width = 0.35

bar_year = [year for year, count in yr_lst]
bar_count = [count for year, count in yr_lst]

plt.bar(ind, bar_count, width, color='r')

plt.ylabel('Count')
plt.xlabel('Year')
plt.title('Number of Torrents per Year')
plt.xticks(ind + width/2., (bar_year), rotation='vertical')
plt.yticks(np.arange(0, 91, 5))

plt.show()

In [ ]:
#log_tor ~ log_budg + Year + Month + log_run + Genre_Single'

In [ ]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'log_budg', fig=fig)

In [ ]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Year', fig=fig)

In [ ]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(results,'Month', fig=fig)
# 1998 mask = (df_yr['Year'] == 1998) & (df_yr['Total_Torrents'] > 100) df_yr = df_yr[~mask] # 1999 mask = (df_yr['Year'] == 1999) & (df_yr['Total_Torrents'] > 200) df_yr = df_yr[~mask] # 2000 mask = (df_yr['Year'] == 2000) & (df_yr['Total_Torrents'] > 200) df_yr = df_yr[~mask] # 2001 mask = (df_yr['Year'] == 2001) & (df_yr['Total_Torrents'] > 300) df_yr = df_yr[~mask] # 2002 mask = (df_yr['Year'] == 2002) & (df_yr['Total_Torrents'] > 300) df_yr = df_yr[~mask] # 2003 mask = (df_yr['Year'] == 2003) & (df_yr['Total_Torrents'] > 300) df_yr = df_yr[~mask] # 2004 # 2005 mask = (df_yr['Year'] == 2005) & (df_yr['Total_Torrents'] > 300) df_yr = df_yr[~mask] # 2006 mask = (df_yr['Year'] == 2006) & (df_yr['Total_Torrents'] > 300) df_yr = df_yr[~mask] # 2007 mask = (df_yr['Year'] == 2007) & (df_yr['Total_Torrents'] > 450) df_yr = df_yr[~mask] # 2010 mask = (df_yr['Year'] == 2010) & (df_yr['Total_Torrents'] > 800) df_yr = df_yr[~mask] # 2014 mask = (df_yr['Year'] == 2014) & (df_yr['Total_Torrents'] > 850) df_yr = df_yr[~mask]

In [ ]:


In [ ]: