notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')



In [2]:

    
df = pd.read_csv('data/data.csv', encoding='latin-1')
df = df[['Prod_Budget', 'Runtime', 'Total_Torrents']]
print(len(df))
print(df.head())









    



1664
   Prod_Budget  Runtime  Total_Torrents
0  425000000.0      162          1450.0
1  300000000.0      148           647.0
2  275000000.0      164           861.0
3  275000000.0      132           723.0
4  260000000.0      100           563.0



In [3]:

    
df['Runtime'].describe()









    Out[3]:





count    1664.000000
mean      110.026442
std        18.226754
min        72.000000
25%        97.000000
50%       107.000000
75%       119.000000
max       229.000000
Name: Runtime, dtype: float64



In [4]:

    
buckets = [0, 80, 100, 120, 140, 160, 180, 200, 300]
labels = ['0-80', '80-100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+']

df['Runtime_Bin'] = pd.cut(df['Runtime'], buckets, labels=labels)
df = df.sort_values(['Runtime'], ascending=True)

print(df['Runtime_Bin'].value_counts())









    



100-120    716
80-100     545
120-140    309
140-160     58
160-180     18
180-200      8
0-80         7
200+         3
Name: Runtime_Bin, dtype: int64



In [5]:

    
# dictionary - runtime bucket counts
duration = df['Runtime_Bin'].value_counts().to_dict()

import operator
srt = sorted(duration.items(), key=operator.itemgetter(1))
srt = srt[::-1]
print(srt)









    



[('100-120', 716), ('80-100', 545), ('120-140', 309), ('140-160', 58), ('160-180', 18), ('180-200', 8), ('0-80', 7), ('200+', 3)]



In [6]:

    
plt.figure(figsize=(25,10))

ind = np.arange(len(duration)) # the x locations for the groups
width = 0.35 # the width of the bars

bar_bin = [n_bin for n_bin, count in srt]
bar_count = [count for n_bin, count in srt]

plt.bar(ind, bar_count, width, color='r')

plt.ylabel('Count')
plt.xlabel('Duration Bin')
plt.title('Number of Movie Duration per Binned Time')
plt.xticks(ind + width/2., (bar_bin), rotation='vertical')
plt.yticks(np.arange(0, 801, 100))

plt.show()



In [7]:

    
term = '100-120'
df_a = df.loc[df['Runtime_Bin'] == term]
print(len(df_a))

plt.figure(figsize=(15,10))
plt.axis([0, 450000000, 0, 1500])
plt.xlabel('Production Budget')
plt.ylabel('Torrents')
plt.title('Torrents vs. Production Budget - {0}'.format(term))
_ = plt.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)



In [8]:

    
term = '80-100'
df_a = df.loc[df['Runtime_Bin'] == term]
print(len(df_a))

plt.figure(figsize=(15,10))
plt.axis([0, 450000000, 0, 1500])
plt.xlabel('Production Budget')
plt.ylabel('Torrents')
plt.title('Torrents vs. Production Budget - {0}'.format(term))
_ = plt.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)



In [9]:

    
term = '120-140'
df_a = df.loc[df['Runtime_Bin'] == term]
print(len(df_a))

plt.figure(figsize=(15,10))
plt.axis([0, 450000000, 0, 1500])
plt.xlabel('Production Budget')
plt.ylabel('Torrents')
plt.title('Torrents vs. Production Budget - {0}'.format(term))
_ = plt.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)



In [10]:

    
df_sub = df[['Prod_Budget', 'Runtime', 'Runtime_Bin', 'Total_Torrents']]
df_sub.describe()









    Out[10]:







  
    
      
      Prod_Budget
      Runtime
      Total_Torrents
    
  
  
    
      count
      1.664000e+03
      1664.000000
      1664.000000
    
    
      mean
      4.773100e+07
      110.026442
      223.215745
    
    
      std
      4.429420e+07
      18.226754
      196.328069
    
    
      min
      6.800000e+04
      72.000000
      17.000000
    
    
      25%
      1.850000e+07
      97.000000
      89.000000
    
    
      50%
      3.500000e+07
      107.000000
      158.000000
    
    
      75%
      6.000000e+07
      119.000000
      298.000000
    
    
      max
      4.250000e+08
      229.000000
      1450.000000



In [11]:

    
# Lets take a look at our entire model
plt.rcParams['figure.figsize'] = (12, 12)
_ = pd.tools.plotting.scatter_matrix(df_sub)









    



/Users/bryant/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.
  app.launch_new_instance()



In [12]:

    
# select only runtime bins of significance
duration = ['80-100', '100-120', '120-140']
df_sub = df_sub.loc[df_sub['Runtime_Bin'].isin(duration)]
df_sub.describe()









    Out[12]:







  
    
      
      Prod_Budget
      Runtime
      Total_Torrents
    
  
  
    
      count
      1.570000e+03
      1570.000000
      1570.000000
    
    
      mean
      4.529649e+07
      107.492994
      217.320382
    
    
      std
      4.070365e+07
      13.846661
      189.354042
    
    
      min
      6.800000e+04
      81.000000
      17.000000
    
    
      25%
      1.800000e+07
      97.000000
      85.250000
    
    
      50%
      3.500000e+07
      106.000000
      153.000000
    
    
      75%
      6.000000e+07
      117.000000
      291.000000
    
    
      max
      2.750000e+08
      140.000000
      1291.000000



In [13]:

    
from patsy import dmatrices

y, x = dmatrices('Total_Torrents ~ Prod_Budget + Runtime_Bin', data=df_sub, return_type='dataframe')



In [14]:

    
y.head()









    Out[14]:







  
    
      
      Total_Torrents
    
  
  
    
      1662
      198.0
    
    
      1265
      40.0
    
    
      961
      281.0
    
    
      375
      258.0
    
    
      1071
      276.0



In [15]:

    
x.head()









    Out[15]:







  
    
      
      Intercept
      Runtime_Bin[T.80-100]
      Runtime_Bin[T.100-120]
      Runtime_Bin[T.120-140]
      Runtime_Bin[T.140-160]
      Runtime_Bin[T.160-180]
      Runtime_Bin[T.180-200]
      Runtime_Bin[T.200+]
      Prod_Budget
    
  
  
    
      1662
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      100000.0
    
    
      1265
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      18000000.0
    
    
      961
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      30000000.0
    
    
      375
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      65000000.0
    
    
      1071
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      25000000.0



In [16]:

    
import statsmodels.api as sm
model = sm.OLS(y, x)
results = model.fit()
results.summary()









    



/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/regression/linear_model.py:1471: RuntimeWarning: divide by zero encountered in double_scalars
  return np.sqrt(eigvals[0]/eigvals[-1])
/Users/bryant/anaconda/lib/python3.6/site-packages/statsmodels/base/model.py:1036: RuntimeWarning: invalid value encountered in true_divide
  return self.params / self.bse
/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/Users/bryant/anaconda/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1818: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)






    Out[16]:





OLS Regression Results

  Dep. Variable:      Total_Torrents     R-squared:             0.180 


  Model:                    OLS          Adj. R-squared:        0.179 


  Method:              Least Squares     F-statistic:           114.9 


  Date:              Sun, 13 Aug 2017    Prob (F-statistic):  3.12e-67 


  Time:                  15:23:53        Log-Likelihood:      -10304. 


  No. Observations:         1570         AIC:                2.062e+04


  Df Residuals:             1566         BIC:                2.064e+04


  Df Model:                    3                                      


  Covariance Type:       nonrobust                                    




                            coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept                  93.1120      5.239     17.774   0.000     82.836    103.388


  Runtime_Bin[T.80-100]      38.2716      6.238      6.135   0.000     26.035     50.508


  Runtime_Bin[T.100-120]     35.3639      5.770      6.129   0.000     24.047     46.681


  Runtime_Bin[T.120-140]     19.4765      8.276      2.353   0.019      3.244     35.709


  Runtime_Bin[T.140-160]           0          0        nan     nan          0          0


  Runtime_Bin[T.160-180]           0          0        nan     nan          0          0


  Runtime_Bin[T.180-200]           0          0        nan     nan          0          0


  Runtime_Bin[T.200+]              0          0        nan     nan          0          0


  Prod_Budget              2.008e-06    1.1e-07     18.338   0.000   1.79e-06   2.22e-06




  Omnibus:        534.136    Durbin-Watson:         2.041


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   2052.114


  Skew:            1.625     Prob(JB):               0.00


  Kurtosis:        7.561     Cond. No.                inf



In [17]:

    
import statsmodels.formula.api as smf

results = smf.ols(formula='Total_Torrents ~ Prod_Budget + Runtime', data=df_sub,).fit()

results.summary()









    Out[17]:





OLS Regression Results

  Dep. Variable:      Total_Torrents     R-squared:             0.180 


  Model:                    OLS          Adj. R-squared:        0.179 


  Method:              Least Squares     F-statistic:           172.3 


  Date:              Sun, 13 Aug 2017    Prob (F-statistic):  2.39e-68 


  Time:                  15:23:53        Log-Likelihood:      -10304. 


  No. Observations:         1570         AIC:                2.061e+04


  Df Residuals:             1567         BIC:                2.063e+04


  Df Model:                    2                                      


  Covariance Type:       nonrobust                                    




                 coef      std err       t       P>|t|   [0.025     0.975]  


  Intercept      177.4143     34.088      5.205   0.000    110.551    244.277


  Prod_Budget   2.009e-06    1.1e-07     18.294   0.000   1.79e-06   2.22e-06


  Runtime         -0.4751      0.323     -1.472   0.141     -1.108      0.158




  Omnibus:        535.759    Durbin-Watson:         2.041


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   2068.568


  Skew:            1.629     Prob(JB):               0.00


  Kurtosis:        7.583     Cond. No.           4.79e+08



In [18]:

    
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x, y)









    Out[18]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [19]:

    
model.score(x, y)









    Out[19]:





0.18036536773284317



In [20]:

    
model.coef_









    Out[20]:





array([[  0.00000000e+00,   7.23428364e+00,   4.32658345e+00,
         -1.15608671e+01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.00814630e-06]])



In [21]:

    
df_sub['log_tor']=np.log(df_sub.Total_Torrents)
df_sub['log_budg']=np.log(df_sub.Prod_Budget)

trans=df_sub[['log_tor','log_budg', 'Runtime']]
_ = pd.tools.plotting.scatter_matrix(trans)









    



/Users/bryant/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.

Log Transform



In [22]:

    
y, x = dmatrices('log_tor ~ log_budg + Runtime_Bin', data=df_sub, return_type='dataframe')
x.head()









    Out[22]:







  
    
      
      Intercept
      Runtime_Bin[T.80-100]
      Runtime_Bin[T.100-120]
      Runtime_Bin[T.120-140]
      Runtime_Bin[T.140-160]
      Runtime_Bin[T.160-180]
      Runtime_Bin[T.180-200]
      Runtime_Bin[T.200+]
      log_budg
    
  
  
    
      1662
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      11.512925
    
    
      1265
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      16.705882
    
    
      961
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      17.216708
    
    
      375
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      17.989898
    
    
      1071
      1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      17.034386



In [23]:

    
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x, y)

model.score(x,y)









    Out[23]:





0.073126521575192061



In [24]:

    
_ = plt.plot(y, model.predict(x), 'ro')



In [25]:

    
from sklearn import cross_validation as cv
from sklearn import metrics

x_train, x_test, y_train, y_test = cv.train_test_split(x,y,test_size=0.333,random_state=1234)

model = LinearRegression().fit(x_train, y_train)









    



/Users/bryant/anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [26]:

    
metrics.mean_squared_error(y_train,model.predict(x_train))









    Out[26]:





0.6256504211661591



In [27]:

    
model.score(x_train, y_train)









    Out[27]:





0.080732417488071695



In [28]:

    
metrics.mean_squared_error(y_test,model.predict(x_test))









    Out[28]:





0.67129015118529678



In [29]:

    
# reset x, y otherwise errors occur
y, x = dmatrices('log_tor ~ log_budg + Runtime_Bin', data=df_sub, return_type='dataframe')

from sklearn.cross_validation import KFold
kf = KFold(len(df_sub), n_folds=100, shuffle=True)

for train_index, test_index in kf:
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x.iloc[train_index], y.iloc[train_index])

clf2.score(x,y)









    Out[29]:





0.072317789136014943



In [ ]:

	Prod_Budget	Runtime	Total_Torrents
count	1.664000e+03	1664.000000	1664.000000
mean	4.773100e+07	110.026442	223.215745
std	4.429420e+07	18.226754	196.328069
min	6.800000e+04	72.000000	17.000000
25%	1.850000e+07	97.000000	89.000000
50%	3.500000e+07	107.000000	158.000000
75%	6.000000e+07	119.000000	298.000000
max	4.250000e+08	229.000000	1450.000000

	Prod_Budget	Runtime	Total_Torrents
count	1.570000e+03	1570.000000	1570.000000
mean	4.529649e+07	107.492994	217.320382
std	4.070365e+07	13.846661	189.354042
min	6.800000e+04	81.000000	17.000000
25%	1.800000e+07	97.000000	85.250000
50%	3.500000e+07	106.000000	153.000000
75%	6.000000e+07	117.000000	291.000000
max	2.750000e+08	140.000000	1291.000000

	Intercept	Runtime_Bin[T.80-100]	Prod_Budget
1662	1.0	1.0	100000.0
1265	1.0	1.0	18000000.0
961	1.0	1.0	30000000.0
375	1.0	1.0	65000000.0
1071	1.0	1.0	25000000.0

Dep. Variable:	Total_Torrents	R-squared:	0.180
Model:	OLS	Adj. R-squared:	0.179
Method:	Least Squares	F-statistic:	114.9
Date:	Sun, 13 Aug 2017	Prob (F-statistic):	3.12e-67
Time:	15:23:53	Log-Likelihood:	-10304.
No. Observations:	1570	AIC:	2.062e+04
Df Residuals:	1566	BIC:	2.064e+04
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	93.1120	5.239	17.774	0.000	82.836	103.388
Runtime_Bin[T.80-100]	38.2716	6.238	6.135	0.000	26.035	50.508
Runtime_Bin[T.100-120]	35.3639	5.770	6.129	0.000	24.047	46.681
Runtime_Bin[T.120-140]	19.4765	8.276	2.353	0.019	3.244	35.709
Runtime_Bin[T.140-160]	0	0	nan	nan	0	0
Runtime_Bin[T.160-180]	0	0	nan	nan	0	0
Runtime_Bin[T.180-200]	0	0	nan	nan	0	0
Runtime_Bin[T.200+]	0	0	nan	nan	0	0
Prod_Budget	2.008e-06	1.1e-07	18.338	0.000	1.79e-06	2.22e-06

Omnibus:	534.136	Durbin-Watson:	2.041
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2052.114
Skew:	1.625	Prob(JB):	0.00
Kurtosis:	7.561	Cond. No.	inf

Omnibus:	535.759	Durbin-Watson:	2.041
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2068.568
Skew:	1.629	Prob(JB):	0.00
Kurtosis:	7.583	Cond. No.	4.79e+08

	Intercept	Runtime_Bin[T.80-100]	log_budg
1662	1.0	1.0	11.512925
1265	1.0	1.0	16.705882
961	1.0	1.0	17.216708
375	1.0	1.0	17.989898
1071	1.0	1.0	17.034386