notebook.community

Edit and run



In [186]:

    
# Import the necessary modules and libraries
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



In [202]:

    
# Load the dataset
df = pd.read_csv("../Data/data_for_lr.csv")
df = df.drop(["Unnamed: 0","country","year","TV-14","TV-MA","TV-PG"],1)
X = df.drop("revenue",1)
y = df["revenue"]









    Out[202]:






  
    
      
      imdb_rating
      imdb_votes
      language
      production
      released_on_weekend
      released_on_dump_month
      runtime
      budget
      actor_popularity
      director_popularity
      ...
      mystery
      romance
      thriller
      other_genre
      G
      NC-17
      PG
      PG-13
      R
      UNRATED
    
  
  
    
      0
      7.3
      11.976842
      2
      102
      1
      0
      111
      16.811243
      21.727389
      2.029155
      ...
      0
      1
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      1
      7.9
      12.884159
      9
      163
      1
      1
      109
      18.146244
      13.835096
      2.089790
      ...
      0
      0
      1
      0
      0
      0
      0
      0
      1
      0
    
    
      2
      6.5
      11.101840
      2
      21
      1
      0
      84
      18.269858
      16.142013
      1.000927
      ...
      1
      0
      1
      0
      0
      0
      0
      0
      1
      0
    
    
      3
      1.7
      8.754634
      1
      102
      0
      1
      91
      17.989898
      13.566246
      1.000000
      ...
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
    
    
      4
      5.2
      12.177020
      1
      210
      1
      1
      114
      18.515991
      7.153206
      2.828585
      ...
      0
      0
      1
      0
      0
      0
      0
      1
      0
      0
    
  

5 rows × 29 columns



In [203]:

    
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



In [204]:

    
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regresser1 = regr_1.fit(x_train, y_train)
regresser2 = regr_2.fit(x_train, y_train)



In [205]:

    
# Predict
y_1 = regr_1.predict(x_test)
y_2 = regr_2.predict(x_test)
#zz = np.reshape(y_1, (712))
mse1 = np.average((y_1-np.array(y_test))**2)
mse2 = np.average((y_2-np.array(y_test))**2)
print (mse1, mse2)









    



3.05564074588 2.25312587554



In [206]:

    
#regr_1.decision_path(x_test)
tree.export_graphviz(regresser1, out_file='../Data/tree1.dot') 
tree.export_graphviz(regresser2, out_file='../Data/tree2.dot')



In [207]:

    
regr_1.feature_importances_









    Out[207]:





array([ 0.        ,  0.15154685,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.84845315,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ])



In [208]:

    
regr_2.feature_importances_









    Out[208]:





array([ 0.00187272,  0.22337238,  0.        ,  0.        ,  0.00091876,
        0.        ,  0.01718695,  0.71008581,  0.00940428,  0.        ,
        0.        ,  0.        ,  0.00371059,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.03344851,  0.        ])



In [209]:

    
regr_1.score(x_train, y_train)









    Out[209]:





0.49311635532312942



In [211]:

    
regr_2.score(x_train, y_train)









    Out[211]:





0.67110717235553219



In [212]:

    
# Plot the results
plt.figure()
plt.scatter(x_train['imdb_votes'], y_train, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(x_test['imdb_votes'], y_1, color="cornflowerblue",
         label="max_depth=2", linewidth=2)
plt.plot(x_test['imdb_votes'], y_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()



In [70]:

    
from matplotlib import pyplot as plt
%matplotlib inline



In [220]:

    
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_1.fit(x_train[["imdb_votes","budget"]],y_train)
nx = 30
ny = 30
# creating a grid of points
x_budget = np.linspace(0, 20, nx) 
y_votes = np.linspace(0, 20, ny)
xx, yy = np.meshgrid(x_budget, y_votes)
# evaluating the regressor on all the points
z = regr_1.predict(np.array([xx.flatten(), yy.flatten()]).T)
zz = np.reshape(z, (nx, ny))



In [222]:

    
fig = plt.figure(figsize=(8, 8))
# plotting the predictions
plt.pcolormesh(x_test['budget'], x_test['imdb_votes'], zz, cmap=plt.cm.YlOrRd)
plt.colorbar(label='revenue predicted') # add a colorbar on the right
# plotting also the observations
plt.scatter(x_train['budget'], x_train['imdb_votes'], s=y_train/2.50, c='g')
# setting the limit for each axis
plt.xlim(np.min(x_test['budget']), np.max(x_test['budget']))
plt.ylim(np.min(x_test['imdb_votes']), np.max(x_test['imdb_votes']))
plt.xlabel('budget')
plt.ylabel('imdb_votes')
plt.show()









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-222-e8a21cd5d28b> in <module>()
      1 fig = plt.figure(figsize=(8, 8))
      2 # plotting the predictions
----> 3 plt.pcolormesh(x_test['budget'], x_test['imdb_votes'], zz, cmap=plt.cm.YlOrRd)
      4 plt.colorbar(label='revenue predicted') # add a colorbar on the right
      5 # plotting also the observations

/Users/Junmeng/anaconda/lib/python3.5/site-packages/matplotlib/pyplot.py in pcolormesh(*args, **kwargs)
   3098         ax.hold(hold)
   3099     try:
-> 3100         ret = ax.pcolormesh(*args, **kwargs)
   3101     finally:
   3102         ax.hold(washold)

/Users/Junmeng/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Users/Junmeng/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in pcolormesh(self, *args, **kwargs)
   5368         allmatch = (shading == 'gouraud')
   5369 
-> 5370         X, Y, C = self._pcolorargs('pcolormesh', *args, allmatch=allmatch)
   5371         Ny, Nx = X.shape
   5372 

/Users/Junmeng/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in _pcolorargs(funcname, *args, **kw)
   4994                 raise TypeError('Dimensions of C %s are incompatible with'
   4995                                 ' X (%d) and/or Y (%d); see help(%s)' % (
-> 4996                                     C.shape, Nx, Ny, funcname))
   4997             C = C[:Ny - 1, :Nx - 1]
   4998         return X, Y, C

TypeError: Dimensions of C (30, 30) are incompatible with X (712) and/or Y (712); see help(pcolormesh)



In [ ]:

	imdb_rating	imdb_votes	language	production	released_on_weekend	released_on_dump_month	runtime	budget	actor_popularity	director_popularity	...	mystery	romance	thriller	PG	PG-13	R
0	7.3	11.976842	2	102	1	0	111	16.811243	21.727389	2.029155	...	0	1	0	0	0	1
1	7.9	12.884159	9	163	1	1	109	18.146244	13.835096	2.089790	...	0	0	1	0	0	1
2	6.5	11.101840	2	21	1	0	84	18.269858	16.142013	1.000927	...	1	0	1	0	0	1
3	1.7	8.754634	1	102	0	1	91	17.989898	13.566246	1.000000	...	0	0	0	1	0	0
4	5.2	12.177020	1	210	1	1	114	18.515991	7.153206	2.828585	...	0	0	1	0	1	0