In [1]:
import os
import sys
if os.environ['PROJECTDIR'] not in sys.path:
    sys.path.insert(1, os.environ['PROJECTDIR'])
import prospecting as p
In [2]:
from sklearn.preprocessing import StandardScaler # normal distribution
from sklearn.preprocessing import MinMaxScaler # scales data to [0, 1]
from sklearn.preprocessing import MaxAbsScaler # scales data to [-1, 1]; for data already centered at zero, or sparse data
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
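As a quick reference for the scaler comments above, here is a standalone sketch (not part of the session) showing how each scaler transforms the same toy feature column; the values are made up purely for illustration:
In [ ]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

X = np.array([[-4.0], [0.0], [2.0], [8.0]])       # toy single-feature column

print(StandardScaler().fit_transform(X).ravel())  # zero mean, unit variance
print(MinMaxScaler().fit_transform(X).ravel())    # rescaled to [0, 1]
print(MaxAbsScaler().fit_transform(X).ravel())    # divided by max |x|: [-0.5, 0., 0.25, 1.]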
In [3]:
# Initialize the model session, passing the path to the pickled dataset
dataset_path = os.path.join(p.DATADIR, 'innocentive_dummified_96406_308.p')
m = p.ModelSession(dataset_path, testsize=0.2, pcasets=False)
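ModelSession is project code, so its internals aren't shown here. Roughly, the testsize=0.2 argument implies something like the following scikit-learn split; the 'target' column name and the attribute layout are assumptions for illustration, not the actual implementation:
In [ ]:
import pickle
from sklearn.model_selection import train_test_split

with open(dataset_path, 'rb') as f:
    df = pickle.load(f)                            # assumed: a pickled pandas DataFrame

X = df.drop(columns=['target'])                    # 'target' column name is hypothetical
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)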
In [4]:
# Initialize SheetsApi instance as attribute of ModelSession instance
m.ss = p.SheetsApi(spreadsheetid='1dG5lQfqthqshz45Rs94VLSSWmSrS60b1iw7cT4Rqevs',
                   scopelist=['https://www.googleapis.com/auth/spreadsheets',
                              'https://www.googleapis.com/auth/drive.metadata'])
m.ss.authenticate()
m.ss.info = m.ss.get_ss_info()
m.ss.sheets = m.ss.load_sheets(['session_report', 'cv_results', 'model_types'])
# Lookup table mapping model_type codes to names, used when reporting on model performance
m.lookup_model_types = dict(m.ss.sheets['model_types'].to_dict(orient='split')['data'])
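The dict(...) call above works because to_dict(orient='split') returns the sheet's rows under a 'data' key as a list of [key, value] pairs. A toy illustration with made-up columns:
In [ ]:
import pandas as pd

# Toy stand-in for m.ss.sheets['model_types'] (column names and values are made up)
sheet = pd.DataFrame({'code': ['lr', 'rfc'],
                      'name': ['Logistic Regression', 'Random Forest']})

split = sheet.to_dict(orient='split')  # {'index': [...], 'columns': [...], 'data': [['lr', ...], ...]}
lookup = dict(split['data'])           # {'lr': 'Logistic Regression', 'rfc': 'Random Forest'}
print(lookup['rfc'])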
In [5]:
# model_config (list): List of config dictionaries to pass to grid_search (see the sketch after this cell);
# each config dictionary contains:
#     pipesteps (list): List of (name, transformer/estimator) tuples defining the pipeline
#     params (dict): Dictionary of pipeline parameters to search over in GridSearchCV
#     scoring (str): Scoring metric to use in GridSearchCV
#     datasets (list): List of strings identifying datasets to use; each must be an attribute of the ModelSession
#     n_jobs (int, optional): Number of cores to use during GridSearchCV, default=3
model_config = [{'pipesteps': [('stdsc', StandardScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.25],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('mmsc', MinMaxScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.5],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('masc', MaxAbsScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.01, 0.1, 0.5, 1],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('rfc', RandomForestClassifier())],
                 'params': {'rfc__n_estimators': [250, 300],
                            'rfc__max_depth': [19, 25]},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']}
                ]
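p.model.grid_search is project code and its implementation isn't shown here; a rough sketch of how a function like it could consume one of these config dicts with plain scikit-learn (the y_train attribute name on ModelSession is an assumption):
In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def run_config(m, config):
    """Fit one GridSearchCV per dataset named in the config (illustrative sketch)."""
    for name in config['datasets']:
        X = getattr(m, name)                       # e.g. m.X_train
        pipe = Pipeline(config['pipesteps'])
        gs = GridSearchCV(pipe,
                          param_grid=config['params'],
                          scoring=config['scoring'],
                          n_jobs=config.get('n_jobs', 3))
        gs.fit(X, m.y_train)                       # assumes ModelSession exposes y_train
        print(name, gs.best_score_, gs.best_params_)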
In [6]:
len(model_config)
Out[6]:
4
In [7]:
p.model.grid_search(m, model_config)
In [ ]:
# To create PCA plots, omit pcasets=False when initializing the session: m = p.ModelSession(dataset_path, testsize=0.2)
plots_dir = 'C:\\Users\\Reid\\Google Drive\\projects\\innocentive\\plots'
p.report.plot_pca_expvar_to_pdf(m, plots_dir)
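p.report.plot_pca_expvar_to_pdf is also project code; a rough stand-in using matplotlib's PdfPages could look like the sketch below (the function name and arguments here are hypothetical, not the project's actual API):
In [ ]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA

def plot_expvar_to_pdf(X, plots_dir, fname='pca_expvar.pdf'):
    """Save a cumulative explained-variance plot for a PCA fit on X (sketch)."""
    pca = PCA().fit(X)
    with PdfPages(os.path.join(plots_dir, fname)) as pdf:
        fig, ax = plt.subplots()
        ax.plot(np.cumsum(pca.explained_variance_ratio_))
        ax.set_xlabel('Number of components')
        ax.set_ylabel('Cumulative explained variance')
        pdf.savefig(fig)
        plt.close(fig)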