In [1]:
import pandas as pd
import numpy as np
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
# Read the latin-1 encoded CSV and keep only the three columns that the
# rest of the notebook works with.
df = (
    pd.read_csv('data/data.csv', encoding='latin-1')
    [['Prod_Budget', 'Runtime', 'Total_Torrents']]
)
print(df.shape[0])
print(df.head())
In [3]:
# Summary statistics of the Runtime column (used to choose the bin edges).
df.loc[:, 'Runtime'].describe()
Out[3]:
In [4]:
# Bin Runtime into labelled intervals. pd.cut uses right-closed intervals by
# default, i.e. (0, 80], (80, 100], ...
# The last edge is open-ended so that the '200+' label really covers every
# runtime above 200; with the previous upper edge of 300, any longer runtime
# fell outside all bins and was silently turned into NaN.
buckets = [0, 80, 100, 120, 140, 160, 180, 200, np.inf]
labels = ['0-80', '80-100', '100-120', '120-140', '140-160', '160-180', '180-200', '200+']

# assign() builds a new frame instead of writing a column into a frame that was
# itself produced by column selection, and makes the cell safe to re-run.
df = (
    df.assign(Runtime_Bin=pd.cut(df['Runtime'], buckets, labels=labels))
      .sort_values(['Runtime'], ascending=True)
)
print(df['Runtime_Bin'].value_counts())
In [5]:
# Map every runtime bin to its row count, then order the (bin, count) pairs
# from the largest count to the smallest.
duration = df['Runtime_Bin'].value_counts().to_dict()
srt = sorted(duration.items(), key=lambda pair: pair[1])
srt.reverse()
print(srt)
In [6]:
# Bar chart of the number of rows per runtime bin, in the order given by `srt`.
plt.figure(figsize=(25,10))
# Size the x positions from `srt` (the data actually plotted) rather than from
# `duration`, which a later cell rebinds to a list of a different length.
ind = np.arange(len(srt))  # the x locations for the groups
width = 0.35  # the width of the bars
bar_bin = [n_bin for n_bin, count in srt]
bar_count = [count for n_bin, count in srt]
# Centre each bar on its x location and place the tick at the same spot.
# The previous `ind + width/2` tick offset only lines up with edge-aligned
# bars, which stopped being matplotlib's default in 2.0.
plt.bar(ind, bar_count, width, color='r', align='center')
plt.ylabel('Count')
plt.xlabel('Duration Bin')
plt.title('Number of Movie Duration per Binned Time')
plt.xticks(ind, bar_bin, rotation='vertical')
plt.yticks(np.arange(0, 801, 100))
plt.show()
In [7]:
# Torrents against production budget, restricted to one runtime bin.
term = '100-120'
df_a = df[df['Runtime_Bin'] == term]
print(df_a.shape[0])

# Fixed axis limits keep the per-bin figures directly comparable.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_xlim(0, 450000000)
ax.set_ylim(0, 1500)
ax.set_xlabel('Production Budget')
ax.set_ylabel('Torrents')
ax.set_title('Torrents vs. Production Budget - {0}'.format(term))
_ = ax.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)
In [8]:
# Torrents against production budget, restricted to one runtime bin.
term = '80-100'
df_a = df[df['Runtime_Bin'] == term]
print(df_a.shape[0])

# Fixed axis limits keep the per-bin figures directly comparable.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_xlim(0, 450000000)
ax.set_ylim(0, 1500)
ax.set_xlabel('Production Budget')
ax.set_ylabel('Torrents')
ax.set_title('Torrents vs. Production Budget - {0}'.format(term))
_ = ax.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)
In [9]:
# Torrents against production budget, restricted to one runtime bin.
term = '120-140'
df_a = df[df['Runtime_Bin'] == term]
print(df_a.shape[0])

# Fixed axis limits keep the per-bin figures directly comparable.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_xlim(0, 450000000)
ax.set_ylim(0, 1500)
ax.set_xlabel('Production Budget')
ax.set_ylabel('Torrents')
ax.set_title('Torrents vs. Production Budget - {0}'.format(term))
_ = ax.scatter(df_a['Prod_Budget'], df_a['Total_Torrents'], alpha=0.5, s=50)
In [10]:
# Working frame for the modelling cells: budget, raw and binned runtime,
# and the torrent count.
model_columns = ['Prod_Budget', 'Runtime', 'Runtime_Bin', 'Total_Torrents']
df_sub = df[model_columns]
df_sub.describe()
Out[10]:
In [11]:
# Let's take a look at our entire model
plt.rcParams['figure.figsize'] = (12, 12)
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# scatter_matrix is available as pandas.plotting.scatter_matrix.
_ = pd.plotting.scatter_matrix(df_sub)
In [12]:
# select only runtime bins of significance
duration = ['80-100', '100-120', '120-140']
# Take an explicit copy: a later cell adds columns to df_sub, and writing to
# the result of a boolean filter triggers pandas' SettingWithCopyWarning.
# NOTE(review): Runtime_Bin is categorical and still carries all eight
# categories after this filter — confirm how the patsy formulas below treat
# the unused levels.
df_sub = df_sub.loc[df_sub['Runtime_Bin'].isin(duration)].copy()
df_sub.describe()
Out[12]:
In [13]:
from patsy import dmatrices

# Build the response (y) and design (x) frames from a formula:
# torrents explained by budget and the runtime bin.
y, x = dmatrices(
    'Total_Torrents ~ Prod_Budget + Runtime_Bin',
    data=df_sub,
    return_type='dataframe'
)
In [14]:
# First rows of the response frame.
y.head(n=5)
Out[14]:
In [15]:
# First rows of the design frame.
x.head(n=5)
Out[15]:
In [16]:
import statsmodels.api as sm

# Ordinary least squares on the patsy-built matrices: y is the response,
# x the design matrix.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
results.summary()
Out[16]:
In [17]:
import statsmodels.formula.api as smf

# Formula-API fit that uses the raw Runtime column as a regressor
# instead of the binned Runtime_Bin.
results = smf.ols('Total_Torrents ~ Prod_Budget + Runtime', data=df_sub).fit()
results.summary()
Out[17]:
In [18]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the same design / response frames.
model = LinearRegression()
model.fit(X=x, y=y)
Out[18]:
In [19]:
# Score of the fitted model on the data it was trained on.
model.score(X=x, y=y)
Out[19]:
In [20]:
# Coefficients of the model fitted above, one per column of x.
model.coef_
Out[20]:
In [21]:
# Add natural-log versions of the torrent count and the production budget.
# assign() returns a new frame, so no column is written into a filtered slice
# (which is what raises SettingWithCopyWarning), and the cell can be re-run.
# NOTE(review): np.log gives -inf for 0 — confirm neither column contains zeros.
df_sub = df_sub.assign(
    log_tor=np.log(df_sub.Total_Torrents),
    log_budg=np.log(df_sub.Prod_Budget)
)
trans = df_sub[['log_tor', 'log_budg', 'Runtime']]
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# scatter_matrix is available as pandas.plotting.scatter_matrix.
_ = pd.plotting.scatter_matrix(trans)
In [22]:
# Rebuild the response / design frames on the log-transformed columns.
y, x = dmatrices(
    'log_tor ~ log_budg + Runtime_Bin',
    data=df_sub,
    return_type='dataframe'
)
x.head(n=5)
Out[22]:
In [23]:
from sklearn.linear_model import LinearRegression

# Refit on the log-scale matrices and report the score on the same data.
model = LinearRegression().fit(X=x, y=y)
model.score(X=x, y=y)
Out[23]:
In [24]:
# Actual response (x axis) against the model's prediction (y axis).
# Labels and a title are set so the figure can be read on its own; the plot
# call stays last so the cell still ends on the `_ =` assignment and shows
# no stray repr.
plt.xlabel('Actual log(Total_Torrents)')
plt.ylabel('Predicted log(Total_Torrents)')
plt.title('Predicted vs. Actual')
_ = plt.plot(y, model.predict(x), 'ro')
In [25]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split is provided by sklearn.model_selection.
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Hold out one third of the rows; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.333, random_state=1234)
model = LinearRegression().fit(x_train, y_train)
In [26]:
# Mean squared error on the training split.
metrics.mean_squared_error(y_true=y_train, y_pred=model.predict(x_train))
Out[26]:
In [27]:
# Score of the model on the training split.
model.score(X=x_train, y=y_train)
Out[27]:
In [28]:
# Mean squared error on the held-out split.
metrics.mean_squared_error(y_true=y_test, y_pred=model.predict(x_test))
Out[28]:
In [29]:
# Rebuild x and y so this cell does not depend on whichever matrices an
# earlier cell left behind.
y, x = dmatrices('log_tor ~ log_budg + Runtime_Bin', data=df_sub, return_type='dataframe')

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. KFold is provided by sklearn.model_selection and yields
# (train, test) index arrays from .split().
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Folds are derived from x itself (not len(df_sub)), so the positional indices
# always match the rows of the design matrix. The seed makes the shuffle
# repeatable.
kf = KFold(n_splits=100, shuffle=True, random_state=1234)

# Every row is predicted exactly once, by the model that did NOT see it during
# fitting. Previously each fold's model was overwritten and only the last one
# was scored, on the full data set including its own training rows.
oof_pred = np.empty(len(y))
for train_index, test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf2 = LinearRegression().fit(x_train, y_train)
    oof_pred[test_index] = np.ravel(clf2.predict(x_test))

# Cross-validated R^2 computed over all out-of-fold predictions.
r2_score(y.values.ravel(), oof_pred)
Out[29]:
In [ ]: