In [337]:
import pandas as pd
from inference import InferenceWrapper, pass_through
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from IPython.display import display

import os
import torch
from torch.cuda import empty_cache
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
feat_df = pd.read_csv('feat_df.csv')

train_mask = combined_sig_df.part != 6
holdout_mask = ~train_mask

In [338]:
# count the labels in the holdout set
from collections import Counter
c = Counter()

for row in combined_sig_df[combined_sig_df.part == 6].labels:
    c.update(row)

In [339]:
X = feat_df[train_mask].values
X.shape


Out[339]:
(7236, 1600)

In [340]:
label_columns = [x for x in combined_sig_df.columns if 'sig/' in x]
y = combined_sig_df[label_columns][train_mask].values
y.shape


Out[340]:
(7236, 28)

In [341]:
X_holdout = feat_df[holdout_mask].values
y_holdout = combined_sig_df[label_columns][holdout_mask].values

In [342]:
def calculate_auc(predictions):
    auc_scores = []
    counts = []

    for i, l in enumerate(label_columns):
        y_hat = predictions[:, i]
        y = y_holdout[:, i]
        auc = roc_auc_score(y_true=y, y_score=y_hat)
        auc_scores.append(auc)
        counts.append(c[l])
    
    df = pd.DataFrame({'label': label_columns, 'auc': auc_scores, 'count': counts})    
    display(df)
    weightedavg_auc = df.apply(lambda x: x.auc * x['count'], axis=1).sum() / df['count'].sum()
    print(f'Weighted Average AUC: {weightedavg_auc}')
    return df, weightedavg_auc

Part 1: Feed Embeddings From Language Model To Downstream Algorithims and Do Greedy Training On Top of That

Keras


In [50]:
def shallow_model(l1=.01, l2=.01):

    inp = Input(shape=(1600,))
    x = Dense(units=30)(inp)
    out = Dense(units=28, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=Adam(lr=.001), loss='categorical_crossentropy')
    return model

shallow_model = shallow_model()

In [51]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=50, validation_split=.15)


Train on 6150 samples, validate on 1086 samples
Epoch 1/50
6150/6150 [==============================] - 1s 198us/step - loss: 3.2999 - val_loss: 3.0119
Epoch 2/50
6150/6150 [==============================] - 0s 76us/step - loss: 2.8669 - val_loss: 2.6908
Epoch 3/50
6150/6150 [==============================] - 0s 76us/step - loss: 2.4282 - val_loss: 2.2630
Epoch 4/50
6150/6150 [==============================] - 0s 74us/step - loss: 2.1039 - val_loss: 2.1075
Epoch 5/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.9807 - val_loss: 2.0253
Epoch 6/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.9082 - val_loss: 1.9793
Epoch 7/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.8464 - val_loss: 1.9242
Epoch 8/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.7821 - val_loss: 1.9131
Epoch 9/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.7433 - val_loss: 1.8881
Epoch 10/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.7132 - val_loss: 1.8841
Epoch 11/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.6824 - val_loss: 1.8497
Epoch 12/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.6411 - val_loss: 1.8140
Epoch 13/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.6281 - val_loss: 1.8346
Epoch 14/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.6011 - val_loss: 1.8096
Epoch 15/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.5753 - val_loss: 1.8423
Epoch 16/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.5611 - val_loss: 1.8438
Epoch 17/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.5494 - val_loss: 1.7939
Epoch 18/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.5422 - val_loss: 1.8161
Epoch 19/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.5092 - val_loss: 1.7943
Epoch 20/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.4912 - val_loss: 1.7744
Epoch 21/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.4816 - val_loss: 1.7901
Epoch 22/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.4533 - val_loss: 1.7697
Epoch 23/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.4489 - val_loss: 1.8118
Epoch 24/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.4428 - val_loss: 1.8678
Epoch 25/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.4349 - val_loss: 1.8378
Epoch 26/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.4039 - val_loss: 1.7674
Epoch 27/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.3905 - val_loss: 1.8125
Epoch 28/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.3730 - val_loss: 1.7775
Epoch 29/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.3601 - val_loss: 1.7703
Epoch 30/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.3647 - val_loss: 1.8213
Epoch 31/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.3504 - val_loss: 1.7999
Epoch 32/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.3347 - val_loss: 1.8245
Epoch 33/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.3230 - val_loss: 1.7669
Epoch 34/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.3056 - val_loss: 1.8174
Epoch 35/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.3048 - val_loss: 1.7797
Epoch 36/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.2938 - val_loss: 1.7946
Epoch 37/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.2679 - val_loss: 1.7625
Epoch 38/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.2675 - val_loss: 1.8138
Epoch 39/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.2656 - val_loss: 1.8448
Epoch 40/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.2668 - val_loss: 1.7733
Epoch 41/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.2347 - val_loss: 1.7633
Epoch 42/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.2229 - val_loss: 1.8056
Epoch 43/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.2238 - val_loss: 1.8188
Epoch 44/50
6150/6150 [==============================] - 0s 74us/step - loss: 1.2157 - val_loss: 1.7920
Epoch 45/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.1974 - val_loss: 1.8284
Epoch 46/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.1955 - val_loss: 1.7944
Epoch 47/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.1729 - val_loss: 1.9240
Epoch 48/50
6150/6150 [==============================] - 0s 76us/step - loss: 1.1914 - val_loss: 1.8080
Epoch 49/50
6150/6150 [==============================] - 0s 78us/step - loss: 1.1582 - val_loss: 1.8158
Epoch 50/50
6150/6150 [==============================] - 0s 75us/step - loss: 1.1592 - val_loss: 1.8506
Out[51]:
<tensorflow.python.keras.callbacks.History at 0x7f4e3f7939e8>

In [27]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=1, validation_split=0)


Epoch 1/1
7236/7236 [==============================] - 0s 65us/step - loss: 1.3368
Out[27]:
<tensorflow.python.keras.callbacks.History at 0x7f4ad6885c18>

In [ ]:
y_hat_holdout = shallow_model.predict(X_holdout)

In [41]:
auc_scores = []

for i, l in enumerate(label_columns):
    y_hat = y_hat_holdout[:, i]
    y = y_holdout[:, i]
    auc = roc_auc_score(y_true=y, y_score=y_hat)
    auc_scores.append(auc)

In [42]:
pd.DataFrame({'label': label_columns, 'auc': auc_scores})


Out[42]:
label auc
0 sig/cluster-lifecycle 0.666893
1 sig/node 0.718941
2 sig/api-machinery 0.787740
3 sig/scalability 0.814389
4 sig/cli 0.840884
5 sig/autoscaling 0.849214
6 sig/network 0.786889
7 sig/cloud-provider 0.741232
8 sig/storage 0.880883
9 sig/scheduling 0.813724
10 sig/apps 0.729923
11 sig/windows 0.909583
12 sig/auth 0.826116
13 sig/docs 0.926942
14 sig/testing 0.802075
15 sig/federation 0.871224
16 sig/gcp 0.688450
17 sig/release 0.891123
18 sig/azure 0.855416
19 sig/aws 0.856246
20 sig/cluster-ops 0.551113
21 sig/multicluster 0.909299
22 sig/instrumentation 0.863066
23 sig/openstack 0.846665
24 sig/contributor-experience 0.857320
25 sig/architecture 0.717019
26 sig/vmware 0.816169
27 sig/service-catalog 0.682101

wow that sucks, lets try something else

Sklearn Nueral Network

b/c multi-label is supported naturally


In [11]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(early_stopping=True, n_iter_no_change=5, max_iter=500, solver='adam', 
                   random_state=1234)

In [12]:
mlp.fit(X, y)


Out[12]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=5, nesterovs_momentum=True, power_t=0.5,
       random_state=1234, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
mlp_predictions = mlp.predict_proba(X_holdout)

In [55]:
mlp_df, mlp_auc = calculate_auc(mlp_predictions)


label auc count
0 sig/cluster-lifecycle 0.863932 498
1 sig/node 0.884496 1311
2 sig/api-machinery 0.892453 1090
3 sig/scalability 0.907244 258
4 sig/cli 0.935913 544
5 sig/autoscaling 0.949778 100
6 sig/network 0.945694 923
7 sig/cloud-provider 0.934848 29
8 sig/storage 0.965592 824
9 sig/scheduling 0.926638 397
10 sig/apps 0.893835 440
11 sig/windows 0.973496 84
12 sig/auth 0.952659 292
13 sig/docs 0.967213 110
14 sig/testing 0.908711 361
15 sig/federation 0.952171 85
16 sig/gcp 0.808062 98
17 sig/release 0.947304 158
18 sig/azure 0.966767 157
19 sig/aws 0.942163 217
20 sig/cluster-ops 0.733281 31
21 sig/multicluster 0.953977 81
22 sig/instrumentation 0.938337 124
23 sig/openstack 0.946715 70
24 sig/contributor-experience 0.917553 72
25 sig/architecture 0.848095 52
26 sig/vmware 0.923500 20
27 sig/service-catalog 0.736944 21
Weighted Average AUC: 0.9168608333252417

Try Tuning the MLP


In [343]:
params = {'hidden_layer_sizes': [(100,), (200,), (400, ), (50, 50), (100, 100), (200, 200)],
              'alpha': [.001, .01, .1, 1, 10],
              'learning_rate': ['constant', 'adaptive'],
              'learning_rate_init': [.001, .01, .1]}
              
mlp_clf = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)

gscvmlp = GridSearchCV(mlp_clf, params, cv=5, n_jobs=-1)

gscvmlp.fit(X, y)


Out[343]:
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)], 'alpha': [0.001, 0.01, 0.1, 1, 10], 'learning_rate': ['constant', 'adaptive'], 'learning_rate_init': [0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [344]:
print(f'The best model from grid search is:\n=====================================\n{gscvmlp.best_estimator_}')


The best model from grid search is:
=====================================
MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(200, 200), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False)

In [347]:
mlp_tuned_predictions = gscvmlp.predict_proba(X_holdout)

In [348]:
mlp_tuned_df, mlp_tuned_auc = calculate_auc(mlp_tuned_predictions)


label auc count
0 sig/cluster-lifecycle 0.861357 498
1 sig/node 0.886055 1311
2 sig/api-machinery 0.893178 1090
3 sig/scalability 0.897738 258
4 sig/cli 0.934423 544
5 sig/autoscaling 0.943365 100
6 sig/network 0.946594 923
7 sig/cloud-provider 0.891243 29
8 sig/storage 0.966184 824
9 sig/scheduling 0.925841 397
10 sig/apps 0.900364 440
11 sig/windows 0.964474 84
12 sig/auth 0.956964 292
13 sig/docs 0.956466 110
14 sig/testing 0.915775 361
15 sig/federation 0.935742 85
16 sig/gcp 0.813389 98
17 sig/release 0.941731 158
18 sig/azure 0.958326 157
19 sig/aws 0.946252 217
20 sig/cluster-ops 0.784832 31
21 sig/multicluster 0.941450 81
22 sig/instrumentation 0.935952 124
23 sig/openstack 0.913931 70
24 sig/contributor-experience 0.912893 72
25 sig/architecture 0.753609 52
26 sig/vmware 0.934651 20
27 sig/service-catalog 0.781518 21
Weighted Average AUC: 0.9161311029647917

Sklearn Random Forest


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import NuSVC

rf = RandomForestClassifier(n_estimators=300,
                             random_state=1234, min_samples_leaf=3, class_weight='balanced')


clf = OneVsRestClassifier(rf, n_jobs=-1)

In [57]:
clf.fit(X, y)


Out[57]:
OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=None, oob_score=False,
            random_state=1234, verbose=0, warm_start=False),
          n_jobs=-1)

In [58]:
rf_predictions = clf.predict_proba(X_holdout)

In [59]:
rf_df, rf_auc = calculate_auc(rf_predictions)


label auc count
0 sig/cluster-lifecycle 0.837176 498
1 sig/node 0.863413 1311
2 sig/api-machinery 0.869993 1090
3 sig/scalability 0.890226 258
4 sig/cli 0.930573 544
5 sig/autoscaling 0.928560 100
6 sig/network 0.926477 923
7 sig/cloud-provider 0.944092 29
8 sig/storage 0.958161 824
9 sig/scheduling 0.898687 397
10 sig/apps 0.880016 440
11 sig/windows 0.965856 84
12 sig/auth 0.949775 292
13 sig/docs 0.951711 110
14 sig/testing 0.907261 361
15 sig/federation 0.917191 85
16 sig/gcp 0.868102 98
17 sig/release 0.936467 158
18 sig/azure 0.971380 157
19 sig/aws 0.940418 217
20 sig/cluster-ops 0.736016 31
21 sig/multicluster 0.928801 81
22 sig/instrumentation 0.912721 124
23 sig/openstack 0.935978 70
24 sig/contributor-experience 0.874981 72
25 sig/architecture 0.825348 52
26 sig/vmware 0.882247 20
27 sig/service-catalog 0.678199 21
Weighted Average AUC: 0.9014719435122518

Sklearn KNN


In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(weights='distance', n_neighbors=10, n_jobs=-1)

In [61]:
knn.fit(X, y)


Out[61]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='distance')

In [62]:
knn_preds = knn.predict_proba(X_holdout)

In [63]:
import numpy as np
knn_preds_stacked = np.stack([x[:, 1] for x in knn_preds], axis=0).T

In [64]:
knn_preds_stacked


Out[64]:
array([[0.      , 0.397812, 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.201453, 0.096647, 0.099647, 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.202347, 0.201023, 0.097884, ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.093967, 0.50943 , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       ...,
       [0.095501, 0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.411118, 0.296557, 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.097659, 0.51011 , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.09672 , 0.096316, 0.      , 0.404727, ..., 0.      , 0.      , 0.      , 0.      ]])

In [65]:
knn_df, knn_auc = calculate_auc(knn_preds_stacked)


label auc count
0 sig/cluster-lifecycle 0.759911 498
1 sig/node 0.802074 1311
2 sig/api-machinery 0.818665 1090
3 sig/scalability 0.819704 258
4 sig/cli 0.869074 544
5 sig/autoscaling 0.792181 100
6 sig/network 0.884401 923
7 sig/cloud-provider 0.657295 29
8 sig/storage 0.919821 824
9 sig/scheduling 0.826708 397
10 sig/apps 0.787182 440
11 sig/windows 0.824049 84
12 sig/auth 0.850271 292
13 sig/docs 0.774042 110
14 sig/testing 0.802061 361
15 sig/federation 0.782056 85
16 sig/gcp 0.689234 98
17 sig/release 0.838163 158
18 sig/azure 0.839639 157
19 sig/aws 0.845231 217
20 sig/cluster-ops 0.540976 31
21 sig/multicluster 0.798270 81
22 sig/instrumentation 0.785553 124
23 sig/openstack 0.732005 70
24 sig/contributor-experience 0.726007 72
25 sig/architecture 0.669761 52
26 sig/vmware 0.741737 20
27 sig/service-catalog 0.562730 21
Weighted Average AUC: 0.8253041170693705

Sklearn GBM


In [66]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
gbm = GradientBoostingClassifier(max_depth=5, min_samples_leaf=3, max_features='auto', n_iter_no_change=4)
clf = OneVsRestClassifier(gbm, n_jobs=-1)

In [67]:
clf.fit(X, y)


Out[67]:
OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_sam...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          n_jobs=-1)

In [68]:
gbm_predictions = clf.predict_proba(X_holdout)

In [69]:
gbm_df, gbm_auc = calculate_auc(gbm_predictions)


label auc count
0 sig/cluster-lifecycle 0.800617 498
1 sig/node 0.854387 1311
2 sig/api-machinery 0.872441 1090
3 sig/scalability 0.852579 258
4 sig/cli 0.925437 544
5 sig/autoscaling 0.862118 100
6 sig/network 0.923715 923
7 sig/cloud-provider 0.551281 29
8 sig/storage 0.954871 824
9 sig/scheduling 0.879484 397
10 sig/apps 0.854226 440
11 sig/windows 0.909860 84
12 sig/auth 0.925322 292
13 sig/docs 0.910010 110
14 sig/testing 0.890536 361
15 sig/federation 0.829683 85
16 sig/gcp 0.647451 98
17 sig/release 0.900076 158
18 sig/azure 0.945144 157
19 sig/aws 0.919447 217
20 sig/cluster-ops 0.513176 31
21 sig/multicluster 0.856024 81
22 sig/instrumentation 0.858293 124
23 sig/openstack 0.724813 70
24 sig/contributor-experience 0.857862 72
25 sig/architecture 0.528693 52
26 sig/vmware 0.532468 20
27 sig/service-catalog 0.502904 21
Weighted Average AUC: 0.8766387996437228

Part 2: Fine Tune Original Language Model

With FastAI & Pytorch

Note: this model was completely trained in another notebook, it is only evaluated here.

Prepare Inference Wrapper


In [171]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

from inference import InferenceWrapper, pass_through
iw = InferenceWrapper(model_path='/ds/multi_class_model/',
                      model_file_name='/ds/multi_class_model/export.pkl')

In [174]:
#because the pre-trained mdoel predicts more than just sig/ labels
pred_mask = [x in label_columns for x in iw.learn.data.classes]

Do pre-processing (such as markdown parsing) to prepare data for model.


In [111]:
parsed_df = iw.process_df(combined_sig_df)


/ds/notebooks/inference.py:62: UserWarning: DataFrame columns are not unique, some columns will be omitted.
  for d in tqdm(dataframe.to_dict(orient='rows')):


In [355]:
holdout_text = parsed_df[holdout_mask]

In [356]:
lang_model_predict = np.stack(holdout_text.text.apply(lambda x: tcl.predict(x)[2].numpy()[pred_mask]).values)

In [359]:
lang_model_predict.shape


Out[359]:
(7154, 24)

In [362]:
len(iw.learn.data.classes)


Out[362]:
45

In [365]:
np.array(iw.learn.data.classes)[pred_mask]


Out[365]:
array(['sig/api-machinery', 'sig/apps', 'sig/architecture', 'sig/auth', 'sig/autoscaling', 'sig/aws', 'sig/azure',
       'sig/cli', 'sig/cloud-provider', 'sig/cluster-lifecycle', 'sig/contributor-experience', 'sig/docs', 'sig/gcp',
       'sig/instrumentation', 'sig/multicluster', 'sig/network', 'sig/node', 'sig/openstack', 'sig/release',
       'sig/scalability', 'sig/scheduling', 'sig/storage', 'sig/testing', 'sig/windows'], dtype='<U31')

In [366]:
lang_model_predict_df = pd.DataFrame(lang_model_predict)
lang_model_predict_df.columns = np.array(iw.learn.data.classes)[pred_mask]
lm_df = lang_model_predict_df[[x for x in label_columns if x in lang_model_predict_df.columns]]

There are some columns that the model was not trained on


In [367]:
missing_cols = [x for x in label_columns if x not in lm_df.columns]

In [368]:
for col in missing_cols:
    lm_df[col] = 0.0


/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [371]:
lm_df, lm_auc = calculate_auc(lm_df.values)


label auc count
0 sig/cluster-lifecycle 0.855230 498
1 sig/node 0.874241 1311
2 sig/api-machinery 0.888351 1090
3 sig/scalability 0.910308 258
4 sig/cli 0.941818 544
5 sig/autoscaling 0.956396 100
6 sig/network 0.942258 923
7 sig/cloud-provider 0.966819 29
8 sig/storage 0.966369 824
9 sig/scheduling 0.926945 397
10 sig/apps 0.873194 440
11 sig/windows 0.979927 84
12 sig/auth 0.957731 292
13 sig/docs 0.967435 110
14 sig/testing 0.920269 361
15 sig/federation 0.529352 85
16 sig/gcp 0.642905 98
17 sig/release 0.385389 158
18 sig/azure 0.613607 157
19 sig/aws 0.456982 217
20 sig/cluster-ops 0.445762 31
21 sig/multicluster 0.713079 81
22 sig/instrumentation 0.450550 124
23 sig/openstack 0.434883 70
24 sig/contributor-experience 0.500000 72
25 sig/architecture 0.500000 52
26 sig/vmware 0.500000 20
27 sig/service-catalog 0.500000 21
Weighted Average AUC: 0.8564939002714045

In [372]:
missing_cols


Out[372]:
['sig/federation', 'sig/cluster-ops', 'sig/vmware', 'sig/service-catalog']

Part 3: Using Text & Bag of Words Instead of Pre-Trained Embeddings (Classic ML)

Count Vectorizer w/ Nueral Net

Choose Nueral Net because supports multi-label classification natively


In [307]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


train_mask = combined_sig_df.part != 6

parameters = {'hidden_layer_sizes': [(50,), (100,), (200,), (400, ), (50, 50), (100, 100)],
              'alpha': [0.0001, .001, .01, .1, 1, 10],
              'learning_rate': ['constant', 'adaptive'],
              'learning_rate_init': [.001, .01]}
              
mlp = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)

In [308]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.column].values
        except KeyError:
            col_error = list(set(self.column) - set(X.column))
            raise KeyError("The DataFrame does not include the columns: %s" % col_error)

In [309]:
model_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("title", make_pipeline(
            ColumnSelector("title"),
            CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='unicode'),
        )),
        ("body", make_pipeline(
            ColumnSelector("body"),
            CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='unicode'),
        ))
    ]),
    GridSearchCV(mlp, parameters, cv=5, n_jobs=-1)
)

In [310]:
y_train = combined_sig_df[label_columns][train_mask].values
x_train_df = combined_sig_df[['body', 'title']][train_mask]

Fit the model


In [311]:
model_pipeline.fit(x_train_df, y_train)


Out[311]:
Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(column='title')), ('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, en...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0))])

In [329]:
best_model = model_pipeline.named_steps['gridsearchcv'].best_estimator_
print(f'The best model from grid search is:\n=====================================\n{best_model}')


The best model from grid search is:
=====================================
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False)

Evaluate Model


In [332]:
holdout_mask = combined_sig_df.part == 6
x_holdout_df = combined_sig_df[['body', 'title']][holdout_mask]
y_holdout = combined_sig_df[label_columns][holdout_mask].values

In [335]:
no_deep_mlp_preds = model_pipeline.predict_proba(x_holdout_df)

In [336]:
no_deep_mlp_df, no_deep_mlp_auc = calculate_auc(no_deep_mlp_preds)


label auc count
0 sig/cluster-lifecycle 0.778223 498
1 sig/node 0.868424 1311
2 sig/api-machinery 0.869518 1090
3 sig/scalability 0.858837 258
4 sig/cli 0.903395 544
5 sig/autoscaling 0.914378 100
6 sig/network 0.899245 923
7 sig/cloud-provider 0.487758 29
8 sig/storage 0.934296 824
9 sig/scheduling 0.883158 397
10 sig/apps 0.836132 440
11 sig/windows 0.928812 84
12 sig/auth 0.872954 292
13 sig/docs 0.828384 110
14 sig/testing 0.804150 361
15 sig/federation 0.850266 85
16 sig/gcp 0.729609 98
17 sig/release 0.871105 158
18 sig/azure 0.895980 157
19 sig/aws 0.888297 217
20 sig/cluster-ops 0.487240 31
21 sig/multicluster 0.898379 81
22 sig/instrumentation 0.798121 124
23 sig/openstack 0.901613 70
24 sig/contributor-experience 0.746154 72
25 sig/architecture 0.574226 52
26 sig/vmware 0.692382 20
27 sig/service-catalog 0.532154 21
Weighted Average AUC: 0.8640413289631089

In [ ]:


In [ ]: