In [17]:
from ast import literal_eval

import pandas as pd
import scipy.sparse  # scipy.sparse.hstack is used below; import the submodule explicitly
from google.cloud import bigquery
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
In [2]:
client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))
In [3]:
#query = """
# SELECT
# *
# FROM
# `ut-goog.nl_api.Karan_rolled_up_data`
#"""
#query_job = client.query(
# query,
# Location must match that of the dataset(s) referenced in the query.
# location="US",
#) # API request - starts the query
#complaints_df = query_job.to_dataframe()
#complaints_df.head()
Out[3]:
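If the private `ut-goog` table is not accessible, a similar frame can be pulled from the BigQuery public CFPB complaints dataset; treat this as a sketch, since the public table's column names may differ from the rolled-up table:

# Hypothetical fallback source (public dataset; schema may differ).
fallback_query = """
    SELECT *
    FROM `bigquery-public-data.cfpb_complaints.complaint_database`
"""
# complaints_df = client.query(fallback_query, location="US").to_dataframe()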
In [4]:
categorical_cols = ['product', 'subproduct', 'issue', 'subissue', 'state', 'zip_code', 'tags']
for col in categorical_cols:
    complaints_df[col] = complaints_df[col].astype('category')
variables = ['company_response_to_consumer',
'product',
'subproduct',
'issue',
'subissue',
'state',
'zip_code',
'tags',
'consumer_complaint_narrative']
complaints_df2 = complaints_df.copy()
complaints_df2 = complaints_df2[variables]
complaints_df2.dropna(subset=['company_response_to_consumer'],inplace=True)
# One-hot encode each categorical; zip_code is skipped (far too many levels).
# rsuffix disambiguates columns that collide across the dummy sets.
for col, suffix in [('product', 'prod'), ('subproduct', 'subprod'),
                    ('issue', 'issue'), ('subissue', 'subissue'),
                    ('state', ''), ('tags', '')]:
    one_hot = pd.get_dummies(complaints_df2[col], drop_first=True)
    complaints_df2 = complaints_df2.join(one_hot, rsuffix=suffix)
variable = ['company_response_to_consumer','product','subproduct','issue','subissue','state','zip_code','tags']
X = complaints_df2.drop(columns=variable)
y = complaints_df2['company_response_to_consumer']
#labels = y.unique()
#y = preprocessing.label_binarize(y, classes=labels)
In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train = X_train.drop(columns=['consumer_complaint_narrative'])
X_test = X_test.drop(columns=['consumer_complaint_narrative'])
In [6]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 100, 200, 300],
}
rf = RandomForestClassifier(n_jobs=-1)  # n_jobs is a fixed setting, not a tuned hyperparameter
clf = GridSearchCV(rf, parameters)
clf.get_params()
Out[6]:
In [ ]:
clf.fit(X_train, y_train)
In [8]:
clf.best_params_
Out[8]:
In [9]:
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]  # probability of a single class; unused by the multiclass metrics below
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n")
In [10]:
confusion_matrix(y_test,y_pred_test)
Out[10]:
In [7]:
def multiclass_roc_auc_score(truth, pred, average="macro"):
    # Binarize the true labels and the hard predictions, then average
    # one-vs-rest AUC. Since this uses predicted labels rather than
    # probabilities, it is a coarser score than a probability-based ROC AUC.
    lb = LabelBinarizer()
    lb.fit(truth)
    truth = lb.transform(truth)
    pred = lb.transform(pred)
    return roc_auc_score(truth, pred, average=average)
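The same four-line evaluation block recurs throughout the notebook; a hypothetical helper along these lines would consolidate it (shown as a sketch only; the cells below keep the inline version):

def report_metrics(model, X_tr, y_tr, X_te, y_te):
    # Hypothetical convenience wrapper over the repeated evaluation block.
    y_pred = model.predict(X_tr)
    y_pred_test = model.predict(X_te)
    print("training accuracy:", round(metrics.accuracy_score(y_tr, y_pred), 4))
    print("test accuracy:", round(metrics.accuracy_score(y_te, y_pred_test), 4))
    print("multiclass auroc:", round(multiclass_roc_auc_score(y_te, y_pred_test), 4))
    print("f1 score:", round(f1_score(y_te, y_pred_test, average='weighted'), 4))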
In [12]:
print('multiclass auroc:',multiclass_roc_auc_score(y_test,y_pred_test))
print('f1 score:', f1_score(y_test, y_pred_test, average='weighted'))
In [8]:
nb = MultinomialNB()
In [14]:
nb.fit(X_train, y_train)
y_pred = nb.predict(X_train)
y_pred_test = nb.predict(X_test)
y_score_test = nb.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [15]:
confusion_matrix(y_test,y_pred_test)
Out[15]:
In [9]:
X_text = complaints_df2.drop(columns=variable)
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=11)
train_text = X_train['consumer_complaint_narrative']
test_text = X_test['consumer_complaint_narrative']
In [17]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# Note: the vocabulary is fit on the full corpus (train and test); fitting on
# train_text alone would avoid leaking test-set vocabulary.
count_vect.fit(complaints_df['consumer_complaint_narrative'])
xtrain_count = count_vect.transform(train_text)
xtest_count = count_vect.transform(test_text)
In [18]:
xtrain_count
Out[18]:
In [12]:
X_train2 = X_train.drop(columns=['consumer_complaint_narrative'])
X_test2 = X_test.drop(columns=['consumer_complaint_narrative'])
In [20]:
X_train_counts = scipy.sparse.hstack([xtrain_count, X_train2])
X_test_counts = scipy.sparse.hstack([xtest_count, X_test2])
Note that grid search works on the random forests below, but it takes a long time per random forest (several hours).
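If runtime is a concern, a randomized search over the same grid samples only a few settings instead of trying them all; a minimal sketch (not used below):

from sklearn.model_selection import RandomizedSearchCV

# Sample n_iter parameter settings from the grid rather than exhausting it.
rand_search = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1),
    param_distributions={'n_estimators': [10, 100, 200, 300]},
    n_iter=2,
    random_state=11,
)
# rand_search.fit(X_train_counts, y_train)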
In [21]:
#rfc = RandomForestClassifier(n_estimators = 100)
In [22]:
clf.fit(X_train_counts, y_train)
y_pred = clf.predict(X_train_counts)
y_pred_test = clf.predict(X_test_counts)
y_score_test = clf.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [23]:
clf.best_params_
Out[23]:
It seems the model did not need many trees, so the grid is reduced for the next runs.
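One way to confirm this is to inspect the cross-validation scores per setting from the fitted grid search above (a sketch):

# Mean CV score for each n_estimators value tried by GridSearchCV.
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['param_n_estimators', 'mean_test_score']])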
In [9]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 50, 100],
}
rf = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(rf, parameters)
In [25]:
clf.fit(xtrain_count, y_train)
y_pred = clf.predict(xtrain_count)
y_pred_test = clf.predict(xtest_count)
y_score_test = clf.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [26]:
confusion_matrix(y_test,y_pred_test)
Out[26]:
In [32]:
clf.best_params_
Out[32]:
In [27]:
nb.fit(X_train_counts, y_train)
y_pred = nb.predict(X_train_counts)
y_pred_test = nb.predict(X_test_counts)
y_score_test = nb.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [28]:
nb.fit(xtrain_count, y_train)
y_pred = nb.predict(xtrain_count)
y_pred_test = nb.predict(xtest_count)
y_score_test = nb.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [29]:
confusion_matrix(y_test,y_pred_test)
Out[29]:
In [30]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf = tfidf_vect.transform(train_text)
xtest_tfidf = tfidf_vect.transform(test_text)
In [31]:
X_train_tfidf = scipy.sparse.hstack([xtrain_tfidf, X_train2])
X_test_tfidf = scipy.sparse.hstack([xtest_tfidf, X_test2])
In [33]:
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_train_tfidf)
y_pred_test = clf.predict(X_test_tfidf)
y_score_test = clf.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [34]:
confusion_matrix(y_test,y_pred_test)
Out[34]:
In [35]:
clf.fit(xtrain_tfidf, y_train)
y_pred = clf.predict(xtrain_tfidf)
y_pred_test = clf.predict(xtest_tfidf)
y_score_test = clf.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [36]:
confusion_matrix(y_test,y_pred_test)
Out[36]:
In [37]:
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_train_tfidf)
y_pred_test = nb.predict(X_test_tfidf)
y_score_test = nb.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [38]:
nb.fit(xtrain_tfidf, y_train)
y_pred = nb.predict(xtrain_tfidf)
y_pred_test = nb.predict(xtest_tfidf)
y_score_test = nb.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [39]:
confusion_matrix(y_test,y_pred_test)
Out[39]:
In [40]:
tfidf_vect_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_vect_bigram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_bigram = tfidf_vect_bigram.transform(train_text)
xvalid_tfidf_bigram = tfidf_vect_bigram.transform(test_text)  # "valid" here is the test split
In [41]:
X_train_tfidf_bi = scipy.sparse.hstack([xtrain_tfidf_bigram, X_train2])
X_test_tfidf_bi = scipy.sparse.hstack([xvalid_tfidf_bigram, X_test2])
In [42]:
clf.fit(X_train_tfidf_bi, y_train)
y_pred = clf.predict(X_train_tfidf_bi)
y_pred_test = clf.predict(X_test_tfidf_bi)
y_score_test = clf.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [43]:
clf.fit(xtrain_tfidf_bigram, y_train)
y_pred = clf.predict(xtrain_tfidf_bigram)
y_pred_test = clf.predict(xvalid_tfidf_bigram)
y_score_test = clf.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [44]:
confusion_matrix(y_test,y_pred_test)
Out[44]:
In [ ]:
nb.fit(X_train_tfidf_bi, y_train)
y_pred = nb.predict(X_train_tfidf_bi)
y_pred_test = nb.predict(X_test_tfidf_bi)
y_score_test = nb.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [ ]:
nb.fit(xtrain_tfidf_bigram, y_train)
y_pred = nb.predict(xtrain_tfidf_bigram)
y_pred_test = nb.predict(xvalid_tfidf_bigram)
y_score_test = nb.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [ ]:
confusion_matrix(y_test,y_pred_test)
In [10]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_text)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(test_text)
In [13]:
X_train_tfidf_n = scipy.sparse.hstack([xtrain_tfidf_ngram, X_train2])
X_test_tfidf_n = scipy.sparse.hstack([xvalid_tfidf_ngram, X_test2])
In [14]:
clf.fit(xtrain_tfidf_ngram, y_train)
y_pred = clf.predict(xtrain_tfidf_ngram)
y_pred_test = clf.predict(xvalid_tfidf_ngram)
y_score_test = clf.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [15]:
confusion_matrix(y_test,y_pred_test)
Out[15]:
In [18]:
nb.fit(X_train_tfidf_n, y_train)
y_pred = nb.predict(X_train_tfidf_n)
y_pred_test = nb.predict(X_test_tfidf_n)
y_score_test = nb.predict_proba(X_test_tfidf_n)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [19]:
nb.fit(xtrain_tfidf_ngram, y_train)
y_pred = nb.predict(xtrain_tfidf_ngram)
y_pred_test = nb.predict(xvalid_tfidf_ngram)
y_score_test = nb.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [20]:
confusion_matrix(y_test,y_pred_test)
Out[20]:
Link to the docs: https://cloud.google.com/natural-language/docs
The NLP API added sentiment_score (the document sentiment of the consumer_complaint_narrative), its magnitude, entities (the extracted entities), entity_types (the type of each extracted entity), entity_sentiment_scores (per-entity sentiment scores), and entity_sentiment_magnitudes (their magnitudes).
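A minimal sketch of how such columns could be produced per narrative, assuming the standard google.cloud.language_v1 client (not the exact pipeline used here):

from google.cloud import language_v1

nl_client = language_v1.LanguageServiceClient()

def annotate(text):
    # Document-level sentiment plus entity-level sentiment for one narrative.
    doc = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    sentiment = nl_client.analyze_sentiment(request={"document": doc}).document_sentiment
    entities = nl_client.analyze_entity_sentiment(request={"document": doc}).entities
    return {
        "sentiment_score": sentiment.score,
        "magnitude": sentiment.magnitude,
        "entities": [e.name for e in entities],
        "entity_types": [language_v1.Entity.Type(e.type_).name for e in entities],
        "entity_sentiment_scores": [e.sentiment.score for e in entities],
        "entity_sentiment_magnitudes": [e.sentiment.magnitude for e in entities],
    }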
In [10]:
mlb = MultiLabelBinarizer(sparse_output=True)
complaints_df3 = complaints_df.copy()
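As a quick illustration of what MultiLabelBinarizer produces (a toy example, separate from the pipeline):

# Each row's list of labels becomes a binary indicator vector;
# columns are the sorted unique labels: ['bank', 'fee', 'loan'].
demo = MultiLabelBinarizer().fit_transform([['bank', 'loan'], ['loan'], ['fee']])
print(demo)  # [[1 0 1]
             #  [0 0 1]
             #  [0 1 0]]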
In [11]:
complaints_df3.dropna(subset=['company_response_to_consumer'],inplace=True)
one_hot1 = pd.get_dummies(complaints_df3['product'], drop_first=True)
one_hot2 = pd.get_dummies(complaints_df3['subproduct'], drop_first=True)
one_hot3 = pd.get_dummies(complaints_df3['issue'], drop_first=True)
one_hot4 = pd.get_dummies(complaints_df3['subissue'], drop_first=True)
one_hot5 = pd.get_dummies(complaints_df3['state'], drop_first=True)
one_hot6 = pd.get_dummies(complaints_df3['zip_code'], drop_first=True)
one_hot7 = pd.get_dummies(complaints_df3['tags'], drop_first=True)
complaints_df3 = complaints_df3.join(one_hot1, rsuffix='prod')
complaints_df3 = complaints_df3.join(one_hot2, rsuffix='subprod')
complaints_df3 = complaints_df3.join(one_hot3, rsuffix='issue')
complaints_df3 = complaints_df3.join(one_hot4, rsuffix='subissue')
complaints_df3 = complaints_df3.join(one_hot5)
#complaints_df2 = complaints_df2.join(one_hot6)
complaints_df3 = complaints_df3.join(one_hot7)
In [12]:
def lister(l):
    # Entity lists are stored as strings like "['bank', 'loan']";
    # literal_eval parses them back into Python lists.
    return literal_eval(l)
def check(l):
    # True if any element of l is itself a nested list.
    return any(isinstance(el, list) for el in l)
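For example (hypothetical inputs):

print(lister("['Bank of America', 'mortgage']"))  # ['Bank of America', 'mortgage']
print(check([['nested'], 'flat']))                # True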
In [13]:
complaints_df3.dropna(subset=['entities'],inplace=True)
complaints_df3['entities2'] = complaints_df3['entities'].apply(lister)
In [14]:
variable = ['company_response_to_consumer',
'product',
'subproduct',
'issue',
'subissue',
'state',
'zip_code',
'tags',
'entities']
X = complaints_df3.drop(columns=variable)
y = complaints_df3['company_response_to_consumer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
In [15]:
# Fit the binarizer on all entity lists so train and test share one label space
# (note this exposes test-set entities to the fitted vocabulary).
mlb.fit(X['entities2'])
mlb.classes_
Out[15]:
In [16]:
X_train_ent = mlb.transform(X_train['entities2'])
X_test_ent = mlb.transform(X_test['entities2'])
In [21]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 20, 30],
}
rf = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(rf, parameters)
In [22]:
clf.fit(X_train_ent, y_train)
y_pred = clf.predict(X_train_ent)
y_pred_test = clf.predict(X_test_ent)
y_score_test = clf.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [20]:
nb.fit(X_train_ent, y_train)
y_pred = nb.predict(X_train_ent)
y_pred_test = nb.predict(X_test_ent)
y_score_test = nb.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [23]:
variables_to_drop = ['complaint_id',
'date_received',
'consumer_complaint_narrative',
'company_public_response',
'company_name',
'consumer_consent_provided',
'submitted_via',
'date_sent_to_company',
'timely_response','consumer_disputed',
'entity_types',
'entity_sentiment_scores',
'entity_sentiment_magnitudes',
'entities2']
new_x = X.drop(columns=variables_to_drop)
In [24]:
X_train, X_test, y_train, y_test = train_test_split(new_x, y, test_size=0.2, random_state=11)
In [25]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [27]:
x_train_nlp = scipy.sparse.hstack([X_train_ent, X_train])
x_test_nlp = scipy.sparse.hstack([X_test_ent, X_test])
In [29]:
clf.fit(x_train_nlp,y_train)
y_pred = clf.predict(x_train_nlp)
y_pred_test = clf.predict(x_test_nlp)
y_score_test = clf.predict_proba(x_test_nlp)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')