In [1]:

    
import pandas as pd
import numpy as np
from sklearn import preprocessing



In [2]:

    
# Read the prepared EDA table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='EDA.csv', index_col=0)

# Preview the first five rows.
df.head(n=5)









    Out[2]:







  
    
      
      imdbID
      Country_Canada
      Country_France
      Country_Germany
      Country_Other
      Country_UK
      Country_USA
      DVD_Fall
      DVD_Month
      DVD_Spring
      ...
      Release_Winter
      Release_Year
      Runtime
      Runtime_Log
      Torrentz_Count
      Torrentz_Ver_Count
      Total_Torrents
      WorldGross
      WorldGross_Log
      Zoogle_Ver_Count
    
  
  
    
      0
      tt0499549
      0
      0
      0
      0
      1
      1
      0
      4.0
      0
      ...
      0
      2009
      162.0
      5.087596
      961
      21
      2120.0
      2783918982
      21.747125
      118.0
    
    
      1
      tt0376994
      1
      0
      0
      0
      1
      1
      1
      10.0
      0
      ...
      0
      2006
      104.0
      4.644391
      226
      11
      516.0
      459260946
      19.945129
      53.0
    
    
      2
      tt1877832
      1
      0
      0
      0
      1
      1
      1
      10.0
      0
      ...
      0
      2014
      132.0
      4.882802
      905
      95
      2092.0
      747862775
      20.432730
      113.0
    
    
      3
      tt3385516
      0
      0
      0
      0
      0
      1
      1
      10.0
      0
      ...
      0
      2016
      144.0
      4.969813
      302
      0
      674.0
      542742489
      20.112146
      130.0
    
    
      4
      tt2103281
      1
      0
      0
      0
      1
      1
      1
      12.0
      0
      ...
      1
      2014
      130.0
      4.867534
      524
      46
      1479.0
      710644566
      20.381683
      66.0
    
  

5 rows × 63 columns



In [3]:

    
# Columns excluded from the feature matrix.
drop_cols = [
    # Individual torrent-count columns (presumably related to the
    # Total_Torrents target -- confirm against the EDA notebook).
    'Extra_Count',
    'Kat_Count',
    'Pirate_Count',
    'Torrentz_Count',
    'Torrentz_Ver_Count',
    'Zoogle_Ver_Count',
    # Raw runtime; a Runtime_Log column is also present in the frame.
    'Runtime',
    # String identifier, not a numeric feature.
    'imdbID',
    # Numeric month; the frame also carries DVD_* season indicator columns.
    'DVD_Month',
    # Raw money columns; each has a *_Log counterpart in the frame.
    'DomesticBudget',
    'ProductionBudget',
    'WorldGross',
]



In [4]:

    
# List every column whose name mentions a budget or gross figure.
[col for col in df.columns if any(key in col for key in ('Budget', 'Gross'))]









    Out[4]:





['DomesticBudget',
 'DomesticBudget_Log',
 'ProductionBudget',
 'ProductionBudget_Log',
 'WorldGross',
 'WorldGross_Log']



In [ ]:



In [5]:

    
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition (and everything fitted on it) is
# reproducible under Restart & Run All.
SEED = 42

# Target column.
y = df['Total_Torrents']

# Build the feature matrix without mutating `df`. Dropping the target in
# place would make a re-run of this cell fail, because the column would
# already be gone from `df`.
X = df.drop(['Total_Torrents'] + drop_cols, axis=1)

# Split first, so that nothing fitted below ever sees the held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED)

# Scale: learn the per-column min/max from the training rows only, then apply
# that same transform to the test rows. Test values may therefore fall
# slightly outside [0, 1]; that is expected.
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

# All rows under the train-fitted scaling, kept under its existing name.
X_minmax = min_max_scaler.transform(X)



In [6]:

    
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNet, LinearRegression

# Wrap an ordinary least-squares model in a feature selector and fit it on
# the training split; the fitted selector is the cell's displayed value.
base_estimator = LinearRegression()
mdl = SelectFromModel(estimator=base_estimator)
mdl.fit(X_train, y_train)









    Out[6]:





SelectFromModel(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
        norm_order=1, prefit=False, threshold=None)



In [7]:

    
# Map the selector's kept column positions back to their names in X.
feats = X.columns[mdl.get_support(indices=True)].tolist()



In [8]:

    
# Show the feature names retained by the fitted selector.
feats









    Out[8]:





['Language_Other',
 'Prod_Size_Large',
 'Prod_Size_Medium',
 'Prod_Size_Small',
 'Prod_Size_Tiny',
 'Prod_Size_Xtreme',
 'Rated_G',
 'Rated_NR',
 'Rated_PG',
 'Rated_PG13',
 'Rated_R',
 'Release_Fall',
 'Release_Spring',
 'Release_Summer',
 'Release_Winter']



In [9]:

    
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



In [10]:

    
# Only hyperparameter searched: whether the linear model fits an intercept.
params = {
    'fit_intercept': [True, False],
}

# 5-fold cross-validated grid search scored by R^2, using 4 parallel workers.
clf = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=params,
    cv=5,
    scoring='r2',
    n_jobs=4,
)
clf.fit(X_train, y_train)









    Out[10]:





GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=0)

Alternative search (appears to be disabled; kept for reference): randomized search over ElasticNet hyperparameters.

    params = {'alpha': [10, 1, 0.5, 0.1, 0.01], 'l1_ratio': [1, 0.5, 0.25, 0.1], 'max_iter': [10000,100000], 'tol': [0.001,0.0001,0.00001], 'fit_intercept': [True, False]}
    clf = RandomizedSearchCV(ElasticNet(), params, cv=5, scoring='r2', n_jobs=4)
    clf.fit(X_train, y_train)



In [11]:

    
# Show the estimator refit with the best-scoring parameter combination.
clf.best_estimator_









    Out[11]:





LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)



In [ ]:

	imdbID	Country_Canada	Country_UK	Country_USA	DVD_Fall	DVD_Month	...	Release_Winter	Release_Year	Runtime	Runtime_Log	Torrentz_Count	Torrentz_Ver_Count	Total_Torrents	WorldGross	WorldGross_Log	Zoogle_Ver_Count
0	tt0499549	0	1	1	0	4.0	...	0	2009	162.0	5.087596	961	21	2120.0	2783918982	21.747125	118.0
1	tt0376994	1	1	1	1	10.0	...	0	2006	104.0	4.644391	226	11	516.0	459260946	19.945129	53.0
2	tt1877832	1	1	1	1	10.0	...	0	2014	132.0	4.882802	905	95	2092.0	747862775	20.432730	113.0
3	tt3385516	0	0	1	1	10.0	...	0	2016	144.0	4.969813	302	0	674.0	542742489	20.112146	130.0
4	tt2103281	1	1	1	1	12.0	...	1	2014	130.0	4.867534	524	46	1479.0	710644566	20.381683	66.0