Run `conda install -c conda-forge dask-ml` and `conda install -c conda-forge dask-searchcv` in your terminal.
In [47]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
In [49]:
# Load the Big Mart train/test splits from CSV files in the working directory.
train = pd.read_csv("Big_Mart_Train.csv")
test = pd.read_csv("Big_Mart_Test.csv")
In [50]:
# Preview the first rows of the raw training data.
train.head()
Out[50]:
In [51]:
# Count missing values per column in the training split.
train.isnull().sum()
Out[51]:
In [52]:
# Count missing values per column in the test split.
test.isnull().sum()
Out[52]:
In [53]:
# fill NA with median
# Impute missing Item_Weight with the TRAIN-set median for both splits.
# Series.median() skips NaNs by default, so it equals np.nanmedian here.
# The original filled test with the *test* median, which leaks test-set
# statistics and is inconsistent with the Outlet_Size cell, where the
# train mode is reused for the test split.
item_weight_median = train.Item_Weight.median()
train.Item_Weight = train.Item_Weight.fillna(item_weight_median)
test.Item_Weight = test.Item_Weight.fillna(item_weight_median)
In [54]:
# Inspect the raw Outlet_Size categories (NaN included) before imputation.
# Parenthesized single-argument print is valid in both Python 2 and 3;
# the bare `print x` statement form is Python-2-only.
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
In [22]:
# fill NA with mode
# Impute missing Outlet_Size with the training-set mode for both splits.
# Computing the mode once up front is equivalent to the original two-step
# fill: filling NaNs with the mode only increases that value's count, so
# the mode of the filled series is unchanged.
outlet_size_mode = train.Outlet_Size.mode().iloc[0]
train.Outlet_Size = train.Outlet_Size.fillna(outlet_size_mode)
test.Outlet_Size = test.Outlet_Size.fillna(outlet_size_mode)
In [24]:
# Check column dtypes to identify the categorical (object) columns.
train.dtypes
Out[24]:
In [55]:
# Audit the distinct values of every categorical column in both splits,
# train then test for each column — same output order as the original
# twelve copy-pasted print statements. print() with a single argument is
# valid in both Python 2 and 3 (the bare statement form is 2-only).
for col in ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
            'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']:
    print(train[col].unique())
    print(test[col].unique())
In [56]:
# Normalise the inconsistent Item_Fat_Content spellings down to the two
# canonical categories. A single dict-form replace is equivalent to the
# original pair of list-form replace calls.
fat_content_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_content_map)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_content_map)
In [57]:
# Confirm Item_Fat_Content now holds only the normalised categories.
# print() form works in both Python 2 and 3 for a single argument.
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
In [58]:
# Inspect the range of outlet establishment years in the training data.
# print() form works in both Python 2 and 3 for a single argument.
print(train.Outlet_Establishment_Year.max())
print(train.Outlet_Establishment_Year.min())
In [77]:
# label encoding, do this by combining train and test together
# Stacking both splits first guarantees a single consistent label mapping,
# then the rows are split back by position. DataFrame.append was removed in
# pandas 2.0; pd.concat without ignore_index reproduces append's behaviour
# (test keeps its original index values) exactly.
test['Item_Outlet_Sales'] = 0  # placeholder so test has the same columns as train
combi = pd.concat([train, test])
number = LabelEncoder()
for col in combi.columns:
    if combi[col].dtype == 'object':
        combi[col] = number.fit_transform(combi[col].astype('str'))
        # keep encoded columns typed as 'object', as the downstream cells expect
        combi[col] = combi[col].astype('object')
# Positional split back into the original train/test row counts.
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]
In [78]:
# Verify the label-encoded training data.
train.head()
Out[78]:
In [79]:
# Convert pandas dataframe to dask
# BUG FIX: `dd` was never imported anywhere in this notebook, so these calls
# raised NameError. Import dask.dataframe here under its conventional alias.
import dask.dataframe as dd

dask_train = dd.from_pandas(train, npartitions=3)
dask_test = dd.from_pandas(test, npartitions=3)
In [80]:
# Verify the converted dask test dataframe.
dask_test.head()
Out[80]:
In [82]:
# Pull the training target out first, then drop the placeholder target and
# the high-cardinality Item_Identifier from both feature matrices. Passing
# both labels in one drop call yields the same columns as the original
# sequential drops.
target = dask_train['Item_Outlet_Sales']
dask_train = dask_train.drop(['Item_Outlet_Sales', 'Item_Identifier'], axis=1)
dask_test = dask_test.drop(['Item_Outlet_Sales', 'Item_Identifier'], axis=1)
In [88]:
# 75/25 train/validation split of the dask feature matrix and target,
# seeded for reproducibility.
from dask_ml.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dask_train, target,
train_size=0.75, test_size=0.25, random_state=410)
In [90]:
# Set up the model/search tooling and a local in-process dask cluster
# (2 workers x 4 threads, 2GB memory cap per worker).
import dask_ml.joblib  # registers the dask joblib backend (older dask-ml API)
# FIX: sklearn.externals.joblib was deprecated and then removed from
# scikit-learn (0.23); import parallel_backend from joblib directly.
from joblib import parallel_backend
from dask.distributed import Client, progress
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import dask_searchcv as dcv
import warnings
warnings.filterwarnings("ignore")
client = Client(processes=False, threads_per_worker=4, n_workers=2, memory_limit='2GB')
In [99]:
# Hyperparameter grid for the random forest: two values per knob
# (except bootstrap) -> 32 candidate combinations.
param_grid = {
'bootstrap': [True],
'max_depth': [8, 9],
'max_features': [2, 3],
'min_samples_leaf': [4, 5],
'min_samples_split': [8, 10],
'n_estimators': [100, 200]
}
# Base estimator; hyperparameters are supplied by the search objects below.
rf = RandomForestRegressor()
In [ ]:
# Grid Search
# Exhaustive search over param_grid with 3-fold CV, executed on the dask
# cluster; scatter= pre-distributes the training data to the workers.
grid_search = dcv.GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3)
with parallel_backend('dask', scatter=[X_train, y_train]):
    grid_search.fit(X_train, y_train)
In [106]:
# Best hyperparameter combination found by the grid search.
grid_search.best_params_
Out[106]:
In [100]:
# Evaluate the refit best estimator on the held-out validation split.
grid_search.score(X_test, y_test)
Out[100]:
In [101]:
# Predict Item_Outlet_Sales for the test set with the best grid-search model.
grid_search.predict(dask_test)
Out[101]:
In [103]:
# Random Search
# Samples candidates from param_grid (default n_iter) with 3-fold CV,
# executed on the dask cluster; scatter= pre-distributes the training data.
rand_search = dcv.RandomizedSearchCV(estimator = rf, param_distributions = param_grid, cv = 3)
with parallel_backend('dask', scatter=[X_train, y_train]):
    rand_search.fit(X_train, y_train)
In [107]:
# Best hyperparameter combination found by the randomized search.
rand_search.best_params_
Out[107]:
In [104]:
# Evaluate the refit randomized-search estimator on the held-out split.
rand_search.score(X_test, y_test)
Out[104]:
In [105]:
# Predict Item_Outlet_Sales for the test set with the best randomized-search model.
rand_search.predict(dask_test)
Out[105]: