In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
# Load the EDA export; the file's first column is used as the row index.
df = pd.read_csv(filepath_or_buffer='EDA.csv', index_col=0)

# Preview the first five rows to sanity-check the load.
df.head(n=5)


Out[2]:
imdbID Country_Canada Country_France Country_Germany Country_Other Country_UK Country_USA DVD_Fall DVD_Month DVD_Spring ... Release_Winter Release_Year Runtime Runtime_Log Torrentz_Count Torrentz_Ver_Count Total_Torrents WorldGross WorldGross_Log Zoogle_Ver_Count
0 tt0499549 0 0 0 0 1 1 0 4.0 0 ... 0 2009 162.0 5.087596 961 21 2120.0 2783918982 21.747125 118.0
1 tt0376994 1 0 0 0 1 1 1 10.0 0 ... 0 2006 104.0 4.644391 226 11 516.0 459260946 19.945129 53.0
2 tt1877832 1 0 0 0 1 1 1 10.0 0 ... 0 2014 132.0 4.882802 905 95 2092.0 747862775 20.432730 113.0
3 tt3385516 0 0 0 0 0 1 1 10.0 0 ... 0 2016 144.0 4.969813 302 0 674.0 542742489 20.112146 130.0
4 tt2103281 1 0 0 0 1 1 1 12.0 0 ... 1 2014 130.0 4.867534 524 46 1479.0 710644566 20.381683 66.0

5 rows × 63 columns


In [3]:
# Columns that are removed from `df` when the feature matrix is built.
drop_cols = [
    # Count columns -- presumably per-source torrent counts related to the
    # Total_Torrents target; TODO confirm against the EDA notebook.
    'Extra_Count', 'Kat_Count', 'Pirate_Count', 'Torrentz_Count',
    'Torrentz_Ver_Count', 'Zoogle_Ver_Count',
    # A Runtime_Log column is also present in the frame.
    'Runtime',
    # String identifier (e.g. 'tt0499549'), not a numeric feature.
    'imdbID',
    'DVD_Month',
    # Raw monetary columns; each has a *_Log counterpart in the frame.
    'DomesticBudget', 'ProductionBudget', 'WorldGross',
]

In [4]:
# List every column whose name mentions a budget or a gross figure.
[col for col in df.columns if any(key in col for key in ('Budget', 'Gross'))]


Out[4]:
['DomesticBudget',
 'DomesticBudget_Log',
 'ProductionBudget',
 'ProductionBudget_Log',
 'WorldGross',
 'WorldGross_Log']

In [ ]:


In [5]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and everything fitted on it) is reproducible
# across Restart & Run All.
RANDOM_STATE = 42

# Target and feature matrix. `df` itself is left untouched: dropping the
# target in place made this cell fail with a KeyError when it was run twice.
y = df['Total_Torrents']
X = df.drop(columns=drop_cols + ['Total_Torrents'])

# Split BEFORE scaling so the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_STATE
)

# Scale: learn the per-column min/max on the training rows only, then apply
# the same transformation to the test rows (test values may therefore fall
# slightly outside [0, 1]).
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

# Full matrix scaled with the training-set ranges; name kept so that any
# cell still referring to `X_minmax` continues to work.
X_minmax = min_max_scaler.transform(X)

In [6]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNet, LinearRegression

# Wrap a plain linear regression in a feature selector and fit it on the
# training split in one step (`fit` returns the selector itself).
mdl = SelectFromModel(estimator=LinearRegression()).fit(X_train, y_train)

# Display the fitted selector.
mdl


Out[6]:
SelectFromModel(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
        norm_order=1, prefit=False, threshold=None)

In [7]:
# Map the selector's kept column positions back to the column names of X.
feats = X.columns[mdl.get_support(indices=True)].tolist()

In [8]:
# Show the names of the features kept by the selector.
feats


Out[8]:
['Language_Other',
 'Prod_Size_Large',
 'Prod_Size_Medium',
 'Prod_Size_Small',
 'Prod_Size_Tiny',
 'Prod_Size_Xtreme',
 'Rated_G',
 'Rated_NR',
 'Rated_PG',
 'Rated_PG13',
 'Rated_R',
 'Release_Fall',
 'Release_Spring',
 'Release_Summer',
 'Release_Winter']

In [9]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [10]:
# Only hyper-parameter searched: whether the linear model fits an intercept.
params = dict(fit_intercept=[True, False])

# 5-fold cross-validated grid search scored by R^2, using 4 parallel jobs.
clf = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=params,
    scoring='r2',
    cv=5,
    n_jobs=4,
)
clf.fit(X_train, y_train)


Out[10]:
GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='r2', verbose=0)
```python
params = {'alpha': [10, 1, 0.5, 0.1, 0.01], 'l1_ratio': [1, 0.5, 0.25, 0.1], 'max_iter': [10000,100000], 'tol': [0.001,0.0001,0.00001], 'fit_intercept': [True, False]}
clf = RandomizedSearchCV(ElasticNet(), params, cv=5, scoring='r2', n_jobs=4)
clf.fit(X_train, y_train)
```

In [11]:
# Show the estimator configuration that the search stored as its best one.
clf.best_estimator_


Out[11]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [ ]: