In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
In [2]:
# Read the prepared dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'EDA.csv',
    index_col=0,
)
# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)
Out[2]:
In [3]:
# Columns that are removed from the feature matrix before modelling
# (consumed by the `df.drop(drop_cols, ...)` call in the split cell).
drop_cols = [
    # *_Count columns -- presumably per-source tallies related to the
    # target; TODO confirm why each is excluded.
    'Extra_Count',
    'Kat_Count',
    'Pirate_Count',
    'Torrentz_Count',
    'Torrentz_Ver_Count',
    'Zoogle_Ver_Count',
    # Other per-title columns excluded from the features.
    'Runtime',
    'imdbID',
    'DVD_Month',
    # Budget / gross columns.
    'DomesticBudget',
    'ProductionBudget',
    'WorldGross',
]
In [4]:
# List every column whose name mentions a budget or a gross figure.
[col for col in df.columns if any(tag in col for tag in ('Budget', 'Gross'))]
Out[4]:
In [ ]:
In [5]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/test partition -- and everything fitted on it --
# is identical on every Restart & Run All.
SEED = 42

# Target column.
y = df['Total_Torrents']

# Feature matrix: remove the target and the excluded columns in a single
# non-mutating call. `df` itself is left untouched, so this cell can be
# re-run without a KeyError on 'Total_Torrents'.
X = df.drop(columns=['Total_Torrents'] + drop_cols)

# Split BEFORE scaling so that no test-set statistics reach the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)

# Scale: learn min/max from the training rows only, then apply the same
# transform to both splits. Test values may therefore fall outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

# Whole feature matrix scaled with the training-set statistics; the name is
# kept so any code that still refers to `X_minmax` continues to work.
X_minmax = min_max_scaler.transform(X)
In [6]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNet, LinearRegression
# Wrap an ordinary least-squares model in SelectFromModel (no explicit
# threshold is passed, so the selector's default applies) and fit it on the
# training split.
mdl = SelectFromModel(estimator=LinearRegression())
mdl.fit(X=X_train, y=y_train)
Out[6]:
In [7]:
# Pair each column name with the selector's boolean support mask and keep
# the names flagged as selected.
feats = [name for name, keep in zip(X.columns, mdl.get_support()) if keep]
In [8]:
# Display the column names retained by the feature selector.
feats
Out[8]:
In [9]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
In [10]:
# Grid-search a linear regression over whether to fit an intercept,
# scoring each candidate by R^2 with 5-fold cross-validation on the
# training split, using 4 parallel jobs.
params = dict(fit_intercept=[True, False])
clf = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=params,
    scoring='r2',
    cv=5,
    n_jobs=4,
)
clf.fit(X_train, y_train)
Out[10]:
In [11]:
# Inspect the estimator the grid search selected (its `best_estimator_`).
clf.best_estimator_
Out[11]:
In [ ]: