combined_sig_df.pkl https://storage.googleapis.com/issue_label_bot/notebook_files/combined_sig_df.pkl
feat_df.csv https://storage.googleapis.com/issue_label_bot/notebook_files/feat_df.csv
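A minimal sketch for fetching these two files locally (assuming the `requests` package is available; any HTTP client works):
import requests
for url in ['https://storage.googleapis.com/issue_label_bot/notebook_files/combined_sig_df.pkl',
            'https://storage.googleapis.com/issue_label_bot/notebook_files/feat_df.csv']:
    fname = url.rsplit('/', 1)[-1]  # save under the original file name
    with open(fname, 'wb') as f:
        f.write(requests.get(url).content)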
In [337]:
import pandas as pd
from inference import InferenceWrapper, pass_through
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from IPython.display import display
import os
import torch
from torch.cuda import empty_cache
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"
combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
feat_df = pd.read_csv('feat_df.csv')
train_mask = combined_sig_df.part != 6
holdout_mask = ~train_mask
In [338]:
# count the labels in the holdout set
from collections import Counter
c = Counter()
for row in combined_sig_df[combined_sig_df.part == 6].labels:
    c.update(row)
In [339]:
X = feat_df[train_mask].values
X.shape
Out[339]:
In [340]:
label_columns = [x for x in combined_sig_df.columns if 'sig/' in x]
y = combined_sig_df[label_columns][train_mask].values
y.shape
Out[340]:
In [341]:
X_holdout = feat_df[holdout_mask].values
y_holdout = combined_sig_df[label_columns][holdout_mask].values
In [342]:
def calculate_auc(predictions):
    auc_scores = []
    counts = []
    for i, l in enumerate(label_columns):
        y_hat = predictions[:, i]
        y = y_holdout[:, i]
        auc = roc_auc_score(y_true=y, y_score=y_hat)
        auc_scores.append(auc)
        counts.append(c[l])
    df = pd.DataFrame({'label': label_columns, 'auc': auc_scores, 'count': counts})
    display(df)
    weightedavg_auc = df.apply(lambda x: x.auc * x['count'], axis=1).sum() / df['count'].sum()
    print(f'Weighted Average AUC: {weightedavg_auc}')
    return df, weightedavg_auc
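As a quick sanity check of calculate_auc (hypothetical usage, not part of the original run), uniformly random scores should produce per-label AUCs, and a weighted average, near 0.5:
import numpy as np
rng = np.random.default_rng(0)
random_scores = rng.random((y_holdout.shape[0], len(label_columns)))
_ = calculate_auc(random_scores)  # expect values hovering around 0.5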
In [50]:
def shallow_model(l1=.01, l2=.01):
    inp = Input(shape=(1600,))
    x = Dense(units=30)(inp)
    out = Dense(units=28, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    # Note: with sigmoid outputs on a multi-label target, binary_crossentropy
    # would be the matching loss; categorical_crossentropy assumes mutually
    # exclusive classes, which may explain the poor results below.
    model.compile(optimizer=Adam(lr=.001), loss='categorical_crossentropy')
    return model

shallow_model = shallow_model()
In [51]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=50, validation_split=.15)
Out[51]:
In [27]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=1, validation_split=0)
Out[27]:
In [ ]:
y_hat_holdout = shallow_model.predict(X_holdout)
In [41]:
auc_scores = []
for i, l in enumerate(label_columns):
    y_hat = y_hat_holdout[:, i]
    y = y_holdout[:, i]
    auc = roc_auc_score(y_true=y, y_score=y_hat)
    auc_scores.append(auc)
In [42]:
pd.DataFrame({'label': label_columns, 'auc': auc_scores})
Out[42]:
Wow, that sucks; let's try something else.
In [11]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(early_stopping=True, n_iter_no_change=5, max_iter=500, solver='adam',
                    random_state=1234)
In [12]:
mlp.fit(X, y)
Out[12]:
In [15]:
mlp_predictions = mlp.predict_proba(X_holdout)
In [55]:
mlp_df, mlp_auc = calculate_auc(mlp_predictions)
In [343]:
from sklearn.model_selection import GridSearchCV

params = {'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)],
          'alpha': [.001, .01, .1, 1, 10],
          'learning_rate': ['constant', 'adaptive'],
          'learning_rate_init': [.001, .01, .1]}
mlp_clf = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)
gscvmlp = GridSearchCV(mlp_clf, params, cv=5, n_jobs=-1)
gscvmlp.fit(X, y)
Out[343]:
In [344]:
print(f'The best model from grid search is:\n=====================================\n{gscvmlp.best_estimator_}')
In [347]:
mlp_tuned_predictions = gscvmlp.predict_proba(X_holdout)
In [348]:
mlp_tuned_df, mlp_tuned_auc = calculate_auc(mlp_tuned_predictions)
In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import NuSVC
rf = RandomForestClassifier(n_estimators=300,
                            random_state=1234, min_samples_leaf=3, class_weight='balanced')
clf = OneVsRestClassifier(rf, n_jobs=-1)
In [57]:
clf.fit(X, y)
Out[57]:
In [58]:
rf_predictions = clf.predict_proba(X_holdout)
In [59]:
rf_df, rf_auc = calculate_auc(rf_predictions)
In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(weights='distance', n_neighbors=10, n_jobs=-1)
In [61]:
knn.fit(X, y)
Out[61]:
In [62]:
knn_preds = knn.predict_proba(X_holdout)
In [63]:
import numpy as np
# With a multi-label y, KNeighborsClassifier.predict_proba returns a list of
# (n_samples, 2) arrays, one per label; take each positive-class column and stack.
knn_preds_stacked = np.stack([x[:, 1] for x in knn_preds], axis=0).T
In [64]:
knn_preds_stacked
Out[64]:
In [65]:
knn_df, knn_auc = calculate_auc(knn_preds_stacked)
In [66]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
gbm = GradientBoostingClassifier(max_depth=5, min_samples_leaf=3, max_features='auto', n_iter_no_change=4)
clf = OneVsRestClassifier(gbm, n_jobs=-1)
In [67]:
clf.fit(X, y)
Out[67]:
In [68]:
gbm_predictions = clf.predict_proba(X_holdout)
In [69]:
gbm_df, gbm_auc = calculate_auc(gbm_predictions)
Note: this model was trained entirely in another notebook; it is only evaluated here.
The model is available for download here:
https://storage.googleapis.com/issue_label_bot/model/multi_class_model/export.pkl
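A minimal sketch for downloading that export (assuming `requests`; the destination directory mirrors the model_path used in the next cell and may differ on your machine):
import os
import requests

url = 'https://storage.googleapis.com/issue_label_bot/model/multi_class_model/export.pkl'
dest_dir = '/ds/multi_class_model/'  # match the model_path passed to InferenceWrapper below
os.makedirs(dest_dir, exist_ok=True)
with open(os.path.join(dest_dir, 'export.pkl'), 'wb') as f:
    f.write(requests.get(url).content)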
In [171]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"
from inference import InferenceWrapper, pass_through
iw = InferenceWrapper(model_path='/ds/multi_class_model/',
                      model_file_name='/ds/multi_class_model/export.pkl')
In [174]:
# because the pre-trained model predicts more than just sig/ labels
pred_mask = [x in label_columns for x in iw.learn.data.classes]
Do pre-processing (such as markdown parsing) to prepare the data for the model.
In [111]:
parsed_df = iw.process_df(combined_sig_df)
In [355]:
holdout_text = parsed_df[holdout_mask]
In [356]:
lang_model_predict = np.stack(holdout_text.text.apply(lambda x: iw.learn.predict(x)[2].numpy()[pred_mask]).values)
In [359]:
lang_model_predict.shape
Out[359]:
In [362]:
len(iw.learn.data.classes)
Out[362]:
In [365]:
np.array(iw.learn.data.classes)[pred_mask]
Out[365]:
In [366]:
lang_model_predict_df = pd.DataFrame(lang_model_predict)
lang_model_predict_df.columns = np.array(iw.learn.data.classes)[pred_mask]
lm_df = lang_model_predict_df[[x for x in label_columns if x in lang_model_predict_df.columns]]
There are some labels that the model was not trained on, so fill those columns with zeros.
In [367]:
missing_cols = [x for x in label_columns if x not in lm_df.columns]
In [368]:
for col in missing_cols:
    lm_df[col] = 0.0
In [371]:
# Reorder the columns to match label_columns so they line up with y_holdout.
lm_df, lm_auc = calculate_auc(lm_df[label_columns].values)
In [372]:
missing_cols
Out[372]:
Count Vectorizer w/ Neural Net
Chose a neural net because it supports multi-label classification natively (see the sketch below).
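To illustrate that claim, a toy sketch on hypothetical data (not from this dataset): MLPClassifier accepts a 2-D binary indicator matrix as y, and predict_proba returns one independent probability per label.
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
X_toy = rng.random((100, 5))
y_toy = (rng.random((100, 3)) > 0.5).astype(int)  # 3 independent binary labels
toy_mlp = MLPClassifier(max_iter=300).fit(X_toy, y_toy)
toy_mlp.predict_proba(X_toy[:2])  # shape (2, 3): one probability per label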
In [307]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
train_mask = combined_sig_df.part != 6
parameters = {'hidden_layer_sizes': [(50,), (100,), (200,), (400,), (50, 50), (100, 100)],
              'alpha': [0.0001, .001, .01, .1, 1, 10],
              'learning_rate': ['constant', 'adaptive'],
              'learning_rate_init': [.001, .01]}
mlp = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)
In [308]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        try:
            return X[self.column].values
        except KeyError:
            col_error = list(set(self.column) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % col_error)
In [309]:
model_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("title", make_pipeline(
            ColumnSelector("title"),
            CountVectorizer(ngram_range=(1, 3), min_df=3, strip_accents='unicode'),
        )),
        ("body", make_pipeline(
            ColumnSelector("body"),
            CountVectorizer(ngram_range=(1, 3), min_df=3, strip_accents='unicode'),
        )),
    ]),
    GridSearchCV(mlp, parameters, cv=5, n_jobs=-1)
)
In [310]:
y_train = combined_sig_df[label_columns][train_mask].values
x_train_df = combined_sig_df[['body', 'title']][train_mask]
In [311]:
model_pipeline.fit(x_train_df, y_train)
Out[311]:
In [329]:
best_model = model_pipeline.named_steps['gridsearchcv'].best_estimator_
print(f'The best model from grid search is:\n=====================================\n{best_model}')
In [332]:
holdout_mask = combined_sig_df.part == 6
x_holdout_df = combined_sig_df[['body', 'title']][holdout_mask]
y_holdout = combined_sig_df[label_columns][holdout_mask].values
In [335]:
no_deep_mlp_preds = model_pipeline.predict_proba(x_holdout_df)
In [336]:
no_deep_mlp_df, no_deep_mlp_auc = calculate_auc(no_deep_mlp_preds)