In [1]:
import os
import sys
if os.environ['PROJECTDIR'] not in sys.path:
    sys.path.insert(1, os.environ['PROJECTDIR'])
import prospecting as p
In [2]:
from sklearn.preprocessing import StandardScaler # normal distribution
from sklearn.preprocessing import MinMaxScaler # scales data to [0, 1]
from sklearn.preprocessing import MaxAbsScaler # scales data to [-1, 1]; for data already centered at zero, or sparse data
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
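As a quick reference for the scaler comments above, here is a standalone sketch (not part of the session) showing how each scaler transforms the same toy feature column; the values are made up purely for illustration:
In [ ]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

X = np.array([[-4.0], [0.0], [2.0], [8.0]])       # toy single-feature column

print(StandardScaler().fit_transform(X).ravel())  # zero mean, unit variance
print(MinMaxScaler().fit_transform(X).ravel())    # rescaled to [0, 1]
print(MaxAbsScaler().fit_transform(X).ravel())    # divided by max |x|: [-0.5, 0., 0.25, 1.]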
In [3]:
# Initialize the model session, passing the path to the pickled dataset
dataset_path = os.path.join(p.DATADIR, 'innocentive_dummified_96406_308.p')
m = p.ModelSession(dataset_path, testsize=0.2, pcasets=False)
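ModelSession is project code, so its internals aren't shown here. Roughly, the testsize=0.2 argument implies something like the following scikit-learn split; the 'target' column name and the attribute layout are assumptions for illustration, not the actual implementation:
In [ ]:
import pickle
from sklearn.model_selection import train_test_split

with open(dataset_path, 'rb') as f:
    df = pickle.load(f)                            # assumed: a pickled pandas DataFrame

X = df.drop(columns=['target'])                    # 'target' column name is hypothetical
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)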
In [4]:
# Initialize SheetsApi instance as attribute of ModelSession instance
m.ss = p.SheetsApi(spreadsheetid='1dG5lQfqthqshz45Rs94VLSSWmSrS60b1iw7cT4Rqevs',
                   scopelist=['https://www.googleapis.com/auth/spreadsheets',
                              'https://www.googleapis.com/auth/drive.metadata'])
m.ss.authenticate()
m.ss.info = m.ss.get_ss_info()
m.ss.sheets = m.ss.load_sheets(['session_report', 'cv_results', 'model_types'])
# Lookup table mapping model_type codes to names, used when reporting on model performance
m.lookup_model_types = dict(m.ss.sheets['model_types'].to_dict(orient='split')['data'])
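The dict(...) call above works because to_dict(orient='split') returns the sheet's rows under a 'data' key as a list of [key, value] pairs. A toy illustration with made-up columns:
In [ ]:
import pandas as pd

# Toy stand-in for m.ss.sheets['model_types'] (column names and values are made up)
sheet = pd.DataFrame({'code': ['lr', 'rfc'],
                      'name': ['Logistic Regression', 'Random Forest']})

split = sheet.to_dict(orient='split')  # {'index': [...], 'columns': [...], 'data': [['lr', ...], ...]}
lookup = dict(split['data'])           # {'lr': 'Logistic Regression', 'rfc': 'Random Forest'}
print(lookup['rfc'])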
In [5]:
# model_config (list): List of config dictionaries to pass to grid_search (see the sketch after this cell);
# each config dictionary contains:
#     pipesteps (list): List of (name, transformer/estimator) tuples defining the pipeline
#     params (dict): Dictionary of pipeline parameters to search over in GridSearchCV
#     scoring (str): Scoring metric to use in GridSearchCV
#     datasets (list): List of strings identifying datasets to use; each must be an attribute of the ModelSession
#     n_jobs (int, optional): Number of cores to use during GridSearchCV, default=3
model_config = [{'pipesteps': [('stdsc', StandardScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.25],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('mmsc', MinMaxScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.5],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('masc', MaxAbsScaler()), ('lr', LogisticRegression())],
                 'params': {'lr__C': [0.01, 0.1, 0.5, 1],
                            'lr__penalty': ['l1', 'l2'],
                            'lr__class_weight': [None, 'balanced']},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']},
                {'pipesteps': [('rfc', RandomForestClassifier())],
                 'params': {'rfc__n_estimators': [250, 300],
                            'rfc__max_depth': [19, 25]},
                 'scoring': 'roc_auc',
                 'datasets': ['X_train']}
                ]
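p.model.grid_search is project code and its implementation isn't shown here; a rough sketch of how a function like it could consume one of these config dicts with plain scikit-learn (the y_train attribute name on ModelSession is an assumption):
In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def run_config(m, config):
    """Fit one GridSearchCV per dataset named in the config (illustrative sketch)."""
    for name in config['datasets']:
        X = getattr(m, name)                       # e.g. m.X_train
        pipe = Pipeline(config['pipesteps'])
        gs = GridSearchCV(pipe,
                          param_grid=config['params'],
                          scoring=config['scoring'],
                          n_jobs=config.get('n_jobs', 3))
        gs.fit(X, m.y_train)                       # assumes ModelSession exposes y_train
        print(name, gs.best_score_, gs.best_params_)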
In [6]:
len(model_config)
Out[6]:
4
In [7]:
p.model.grid_search(m, model_config)
In [ ]:
# To create PCA plots, omit pcasets=False when initializing the session: m = p.ModelSession(dataset_path, testsize=0.2)
plots_dir = 'C:\\Users\\Reid\\Google Drive\\projects\\innocentive\\plots'
p.report.plot_pca_expvar_to_pdf(m, plots_dir)
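p.report.plot_pca_expvar_to_pdf is also project code; a rough stand-in using matplotlib's PdfPages could look like the sketch below (the function name and arguments here are hypothetical, not the project's actual API):
In [ ]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA

def plot_expvar_to_pdf(X, plots_dir, fname='pca_expvar.pdf'):
    """Save a cumulative explained-variance plot for a PCA fit on X (sketch)."""
    pca = PCA().fit(X)
    with PdfPages(os.path.join(plots_dir, fname)) as pdf:
        fig, ax = plt.subplots()
        ax.plot(np.cumsum(pca.explained_variance_ratio_))
        ax.set_xlabel('Number of components')
        ax.set_ylabel('Cumulative explained variance')
        pdf.savefig(fig)
        plt.close(fig)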