In [17]:
from google.cloud import bigquery
import pandas
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder 
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

In [2]:
client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))


Client creating using default project: ut-goog

This query runs off the same step used to make the clean complaints dataset (combine_tables.sql file output).


In [3]:
#query = """
#    SELECT 
#        *
#    FROM 
#        `ut-goog.nl_api.Karan_rolled_up_data`
#"""
#query_job = client.query(
#    query,
    # Location must match that of the dataset(s) referenced in the query.
#    location="US",
#)  # API request - starts the query

#complaints_df = query_job.to_dataframe()
#complaints_df.head()


Out[3]:
complaint_id date_received product subproduct issue subissue consumer_complaint_narrative company_public_response company_name state ... date_sent_to_company company_response_to_consumer timely_response consumer_disputed sentiment_score sentiment_magnitude entities entity_types entity_sentiment_scores entity_sentiment_magnitudes
0 3037829 2018-10-04 Debt collection Other debt Other Other ERC ONCE AGAIN WENT ON MY CREDIT REPORTS WITH ... Company can't verify or dispute the facts in t... ERC AR ... 2018-10-04 Closed with explanation True None 0.0 0.2 ['ERC', 'CREDIT REPORTS', 'VALIDATION', 'COLLE... ['ORGANIZATION', 'OTHER', 'OTHER', 'OTHER', 'O... [-0.20000000298023224, -0.10000000149011612, -... [0.20000000298023224, 0.10000000149011612, 0.3...
1 2114401 2016-09-15 Debt collection Other Other Other I have disputed this debt and asked them not t... Company believes it acted appropriately as aut... ERC CA ... 2016-09-15 Closed with non-monetary relief True False -0.1 0.3 ['debt', 'debt', 'letter', 'debt'] ['OTHER', 'OTHER', 'WORK_OF_ART', 'OTHER'] [-0.4000000059604645, 0, 0, 0] [0.4000000059604645, 0, 0, 0]
2 2890918 2018-04-30 Debt collection Other debt Other Other I am a victim of identity theft, company XXXX ... Company believes it acted appropriately as aut... ERC GA ... 2018-04-30 Closed with explanation True None -0.9 0.9 ['account', 'Account XXXX', 'XXXX XXXX XXXX', ... ['OTHER', 'EVENT', 'ORGANIZATION', 'OTHER', 'P... [-0.699999988079071, -0.800000011920929, -0.69... [1.399999976158142, 1.600000023841858, 1.5, 0....
3 1936042 2016-05-22 Debt collection Other Other Other I did n't have the money to pay the bills at t... Company believes it acted appropriately as aut... ERC VA ... 2016-05-26 Closed with non-monetary relief True False -0.1 0.3 ['money', 'bills', 'payment process'] ['OTHER', 'OTHER', 'OTHER'] [-0.20000000298023224, -0.20000000298023224, 0... [0.20000000298023224, 0.20000000298023224, 0.1...
4 3108556 2018-12-24 Debt collection I do not know Communication tactics Frequent or repeated calls The company ERC have been calling my phone eve... Company believes it acted appropriately as aut... ERC NC ... 2018-12-24 Closed with explanation True None -0.8 3.2 ['ERC', 'phone', 'times', 'number', 'XXXX XXXX... ['ORGANIZATION', 'CONSUMER_GOOD', 'EVENT', 'OT... [-0.5, -0.30000001192092896, -0.60000002384185... [3.299999952316284, 0.30000001192092896, 0.600...

5 rows × 24 columns

One Hot Encoding


In [4]:
complaints_df['product'] = complaints_df['product'].astype('category')
complaints_df['subproduct'] = complaints_df['subproduct'].astype('category')
complaints_df['issue'] = complaints_df['issue'].astype('category')
complaints_df['subissue'] = complaints_df['subissue'].astype('category')
complaints_df['state'] = complaints_df['state'].astype('category')
complaints_df['zip_code'] = complaints_df['zip_code'].astype('category')
complaints_df['tags'] = complaints_df['tags'].astype('category')

variables = ['company_response_to_consumer',
             'product',
             'subproduct',
             'issue',
             'subissue',
             'state',
             'zip_code',
             'tags',
             'consumer_complaint_narrative']
complaints_df2 = complaints_df.copy()
complaints_df2 = complaints_df2[variables]
complaints_df2.dropna(subset=['company_response_to_consumer'],inplace=True)

one_hot1 = pd.get_dummies(complaints_df2['product'], drop_first=True)
one_hot2 = pd.get_dummies(complaints_df2['subproduct'], drop_first=True)
one_hot3 = pd.get_dummies(complaints_df2['issue'], drop_first=True)
one_hot4 = pd.get_dummies(complaints_df2['subissue'], drop_first=True)
one_hot5 = pd.get_dummies(complaints_df2['state'], drop_first=True)
one_hot6 = pd.get_dummies(complaints_df2['zip_code'], drop_first=True)
one_hot7 = pd.get_dummies(complaints_df2['tags'], drop_first=True)

complaints_df2 = complaints_df2.join(one_hot1, rsuffix='prod')
complaints_df2 = complaints_df2.join(one_hot2, rsuffix='subprod')
complaints_df2 = complaints_df2.join(one_hot3, rsuffix='issue')
complaints_df2 = complaints_df2.join(one_hot4, rsuffix='subissue')
complaints_df2 = complaints_df2.join(one_hot5)
#complaints_df2 = complaints_df2.join(one_hot6)
complaints_df2 = complaints_df2.join(one_hot7)

variable = ['company_response_to_consumer','product','subproduct','issue','subissue','state','zip_code','tags']

X = complaints_df2.drop(columns=variable)
y = complaints_df2['company_response_to_consumer']
#labels = y.unique()
#y = preprocessing.label_binarize(y, classes=labels)

Random Forest: No Text


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train = X_train.drop(columns=['consumer_complaint_narrative'])
X_test = X_test.drop(columns=['consumer_complaint_narrative'])

In [6]:
parameters = {#'class_weight':[None,'balanced'],
              #'max_depth':[2,4,6,8],
              'n_estimators':[10,100,200,300],
              'n_jobs':[-1]
             }
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)
clf.get_params()


Out[6]:
{'cv': None,
 'error_score': 'raise',
 'estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 'estimator__bootstrap': True,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 10,
 'estimator__n_jobs': 1,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'fit_params': None,
 'iid': True,
 'n_jobs': 1,
 'param_grid': {'n_estimators': [10, 100, 200, 300], 'n_jobs': [-1]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': 'warn',
 'scoring': None,
 'verbose': 0}

In [ ]:
clf.fit(X_train, y_train)

In [8]:
clf.best_params_


Out[8]:
{'n_estimators': 300, 'n_jobs': -1}

In [9]:
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n")


training accuracy: 0.8115 
test accuracy: 0.8049 


In [10]:
confusion_matrix(y_test,y_pred_test)


Out[10]:
array([[    0,   742,     2,     5,     0],
       [   11, 79781,   140,   318,    13],
       [    0,  5299,    52,    20,     0],
       [    2, 12205,    22,   132,     0],
       [    1,   594,     1,     3,     1]])

In [7]:
def multiclass_roc_auc_score(truth, pred, average="macro"):

    lb = LabelBinarizer()
    lb.fit(truth)

    truth = lb.transform(truth)
    pred = lb.transform(pred)

    return roc_auc_score(truth, pred, average=average)

In [12]:
print('multiclass auroc:',multiclass_roc_auc_score(y_test,y_pred_test))
print('f1 score:', f1_score(y_test, y_pred_test, average='weighted'))


multiclass auroc: 0.502264488460064
f1 score: 0.7242465439128403

Naive Bayes w/ no text


In [8]:
nb = MultinomialNB()

In [14]:
nb.fit(X_train, y_train)

y_pred = nb.predict(X_train)
y_pred_test = nb.predict(X_test)
y_score_test = nb.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.6832 
test accuracy: 0.6811 
multiclass aucroc: 0.5951 
f1 score: 0.696 


In [15]:
confusion_matrix(y_test,y_pred_test)


Out[15]:
array([[  204,   516,    20,     5,     4],
       [ 2757, 62344,  9314,  5655,   193],
       [   31,  2330,  3002,     7,     1],
       [  488,  8542,  1166,  2106,    59],
       [   45,   519,    17,    13,     6]])

Random Forest with traditional methods

Using counts


In [9]:
X_text = complaints_df2.drop(columns=variable)
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=11)
train_text = X_train['consumer_complaint_narrative']
test_text = X_test['consumer_complaint_narrative']

In [17]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(complaints_df['consumer_complaint_narrative'])

xtrain_count =  count_vect.transform(train_text)
xtest_count =  count_vect.transform(test_text)

In [18]:
xtrain_count


Out[18]:
<397373x132147 sparse matrix of type '<class 'numpy.int64'>'
	with 37102072 stored elements in Compressed Sparse Row format>

With Meta Data


In [12]:
X_train2 = X_train.drop(columns=['consumer_complaint_narrative'])
X_test2 = X_test.drop(columns=['consumer_complaint_narrative'])

In [20]:
X_train_counts = scipy.sparse.hstack([xtrain_count, X_train2])
X_test_counts = scipy.sparse.hstack([xtest_count, X_test2])

Note that using the grid search will work on the following random forests, but will take a long time per random forest (several hours).


In [21]:
#rfc = RandomForestClassifier(n_estimators = 100)

In [22]:
clf.fit(X_train_counts, y_train)
y_pred = clf.predict(X_train_counts)
y_pred_test = clf.predict(X_test_counts)
y_score_test = clf.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.962 
test accuracy: 0.796 
multiclass aucroc: 0.5185 
f1 score: 0.7363 


In [23]:
clf.best_params_


Out[23]:
{'n_estimators': 10, 'n_jobs': -1}

Seems like it did not need many trees. Will reduce for the next ones.


In [9]:
parameters = {#'class_weight':[None,'balanced'],
              #'max_depth':[2,4,6,8],
              'n_estimators':[10,50,100],
              'n_jobs':[-1]
             }
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)

Just Text


In [25]:
clf.fit(xtrain_count, y_train)
y_pred = clf.predict(xtrain_count)
y_pred_test = clf.predict(xtest_count)
y_score_test = clf.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
    "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9605 
test accuracy: 0.7959 
multiclass aucroc: 0.5143 
f1 score: 0.733 


In [26]:
confusion_matrix(y_test,y_pred_test)


Out[26]:
array([[   22,   723,     0,     4,     0],
       [   27, 77996,   125,  2071,    44],
       [    0,  5259,    94,    18,     0],
       [    1, 11387,    19,   950,     4],
       [    0,   583,     0,     8,     9]])

In [32]:
clf.best_params_


Out[32]:
{'n_estimators': 10, 'n_jobs': -1}

Naive Bayes Using Counts

Counts + meta


In [27]:
nb.fit(X_train_counts, y_train)
y_pred = nb.predict(X_train_counts)
y_pred_test = nb.predict(X_test_counts)
y_score_test = nb.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.5364 
test accuracy: 0.5282 
multiclass aucroc: 0.6309 
f1 score: 0.5809 

Just counts


In [28]:
nb.fit(xtrain_count, y_train)
y_pred = nb.predict(xtrain_count)
y_pred_test = nb.predict(xtest_count)
y_score_test = nb.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
      "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.5661 
test accuracy: 0.5576 
multiclass aucroc: 0.6222 
f1 score: 0.6078 


In [29]:
confusion_matrix(y_test,y_pred_test)


Out[29]:
array([[   26,   515,    79,   110,    19],
       [   91, 44505, 12192, 23193,   282],
       [    1,  1017,  4246,   103,     4],
       [    4,  4363,  1374,  6583,    37],
       [    0,   432,    41,    90,    37]])

Random Forest/Naive Bayes Using TF-IDF

unigram-tf-idf: random forest


In [30]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf =  tfidf_vect.transform(train_text)
xtest_tfidf =  tfidf_vect.transform(test_text)

In [31]:
X_train_tfidf = scipy.sparse.hstack([xtrain_tfidf, X_train2])
X_test_tfidf = scipy.sparse.hstack([xtest_tfidf, X_test2])

With text and meta


In [33]:
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_train_tfidf)
y_pred_test = clf.predict(X_test_tfidf)
y_score_test = clf.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9848 
test accuracy: 0.7991 
multiclass aucroc: 0.5172 
f1 score: 0.7367 


In [34]:
confusion_matrix(y_test,y_pred_test)


Out[34]:
array([[   25,   719,     0,     5,     0],
       [   22, 78174,    86,  1933,    48],
       [    0,  5227,   124,    20,     0],
       [    1, 11292,    15,  1050,     3],
       [    0,   582,     0,     9,     9]])

Just text


In [35]:
clf.fit(xtrain_tfidf, y_train)
y_pred = clf.predict(xtrain_tfidf)
y_pred_test = clf.predict(xtest_tfidf)
y_score_test = clf.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
      "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9838 
test accuracy: 0.7984 
multiclass aucroc: 0.5153 
f1 score: 0.7344 


In [36]:
confusion_matrix(y_test,y_pred_test)


Out[36]:
array([[   26,   718,     0,     5,     0],
       [   22, 78244,    79,  1875,    43],
       [    0,  5263,    90,    18,     0],
       [    2, 11396,    12,   947,     4],
       [    0,   583,     0,     8,     9]])

unigram-tf-idf: naive bayes

Text and meta


In [37]:
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_train_tfidf)
y_pred_test = nb.predict(X_test_tfidf)
y_score_test = nb.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.6398 
test accuracy: 0.636 
multiclass aucroc: 0.636 
f1 score: 0.6734 

Just with the text


In [38]:
nb.fit(xtrain_tfidf, y_train)
y_pred = nb.predict(xtrain_tfidf)
y_pred_test = nb.predict(xtest_tfidf)
y_score_test = nb.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.8078 
test accuracy: 0.8068 
multiclass aucroc: 0.5191 
f1 score: 0.7361 

/usr/local/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [39]:
confusion_matrix(y_test,y_pred_test)


Out[39]:
array([[   15,   728,     3,     3,     0],
       [    0, 79261,   689,   313,     0],
       [    0,  4698,   673,     0,     0],
       [    0, 12029,   128,   204,     0],
       [    0,   600,     0,     0,     0]])

bi-grams: random forest


In [40]:
tfidf_vect_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_vect_bigram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_bigram =  tfidf_vect_bigram.transform(train_text)
xvalid_tfidf_bigram =  tfidf_vect_bigram.transform(test_text)

In [41]:
X_train_tfidf_bi = scipy.sparse.hstack([xtrain_tfidf_bigram, X_train2])
X_test_tfidf_bi = scipy.sparse.hstack([xvalid_tfidf_bigram, X_test2])

Text and Meta Data


In [42]:
clf.fit(X_train_tfidf_bi, y_train)
y_pred = clf.predict(X_train_tfidf_bi)
y_pred_test = clf.predict(X_test_tfidf_bi)
y_score_test = clf.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9841 
test accuracy: 0.7969 
multiclass aucroc: 0.5166 
f1 score: 0.7353 

Just text


In [43]:
clf.fit(xtrain_tfidf_bigram, y_train)
y_pred = clf.predict(xtrain_tfidf_bigram)
y_pred_test = clf.predict(xvalid_tfidf_bigram)
y_score_test = clf.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9826 
test accuracy: 0.7963 
multiclass aucroc: 0.5138 
f1 score: 0.7326 


In [44]:
confusion_matrix(y_test,y_pred_test)


Out[44]:
array([[   20,   727,     0,     2,     0],
       [   30, 78067,    82,  2032,    52],
       [    0,  5271,    90,    10,     0],
       [    1, 11433,     3,   921,     3],
       [    0,   579,     0,    11,    10]])

bi-grams: naive bayes


In [ ]:
nb.fit(X_train_tfidf_bi, y_train)
y_pred = nb.predict(X_train_tfidf_bi)
y_pred_test = nb.predict(X_test_tfidf_bi)
y_score_test = nb.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.6049 
test accuracy: 0.6032 
multiclass aucroc: 0.6421 
f1 score: 0.65 

Just text


In [ ]:
nb.fit(xtrain_tfidf_bigram, y_train)
y_pred = nb.predict(xtrain_tfidf_bigram)
y_pred_test = nb.predict(xvalid_tfidf_bigram)
y_score_test = nb.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.7989 
test accuracy: 0.7985 
multiclass aucroc: 0.5322 
f1 score: 0.7422 

/usr/local/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [ ]:
confusion_matrix(y_test,y_pred_test)


Out[ ]:
array([[    0,   735,    11,     3,     0],
       [    0, 77511,  1673,  1067,    12],
       [    0,  4102,  1268,     1,     0],
       [    0, 11641,   173,   547,     0],
       [    0,   597,     0,     1,     2]])

bi/tri grams: random forest


In [10]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_text)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(test_text)

In [13]:
X_train_tfidf_n = scipy.sparse.hstack([xtrain_tfidf_ngram, X_train2])
X_test_tfidf_n = scipy.sparse.hstack([xvalid_tfidf_ngram, X_test2])

with meta and text data


In [50]:
clf.fit(xtrain_tfidf_ngram, y_train)
y_pred = clf.predict(xtrain_tfidf_ngram)
y_pred_test = clf.predict(xvalid_tfidf_ngram)
y_score_test = clf.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9824 
test accuracy: 0.7972 
multiclass aucroc: 0.5137 
f1 score: 0.7329 


In [51]:
confusion_matrix(y_test,y_pred_test)


Out[51]:
array([[   18,   727,     1,     3,     0],
       [   24, 78178,    80,  1933,    48],
       [    0,  5260,   101,    10,     0],
       [    1, 11460,     6,   891,     3],
       [    0,   580,     0,    10,    10]])

Just text


In [14]:
clf.fit(xtrain_tfidf_ngram, y_train)
y_pred = clf.predict(xtrain_tfidf_ngram)
y_pred_test = clf.predict(xvalid_tfidf_ngram)
y_score_test = clf.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9823 
test accuracy: 0.7974 
multiclass aucroc: 0.5135 
f1 score: 0.7328 


In [15]:
confusion_matrix(y_test,y_pred_test)


Out[15]:
array([[   20,   725,     1,     3,     0],
       [   21, 78207,    78,  1911,    46],
       [    0,  5275,    91,     5,     0],
       [    1, 11466,     3,   888,     3],
       [    0,   582,     0,     9,     9]])

bi/tri grams: naive bayes

Text + Meta


In [18]:
nb.fit(X_train_tfidf_n, y_train)
y_pred = nb.predict(X_train_tfidf_n)
y_pred_test = nb.predict(X_test_tfidf_n)
y_score_test = nb.predict_proba(X_test_tfidf_n)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.5945 
test accuracy: 0.592 
multiclass aucroc: 0.6416 
f1 score: 0.6414 

Just text


In [19]:
nb.fit(xtrain_tfidf_ngram, y_train)
y_pred = nb.predict(xtrain_tfidf_ngram)
y_pred_test = nb.predict(xvalid_tfidf_ngram)
y_score_test = nb.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.7892 
test accuracy: 0.7892 
multiclass aucroc: 0.5392 
f1 score: 0.7426 

/usr/local/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [20]:
confusion_matrix(y_test,y_pred_test)


Out[20]:
array([[    0,   728,    16,     5,     0],
       [    0, 76042,  2412,  1794,    15],
       [    0,  3868,  1501,     2,     0],
       [    0, 11244,   259,   858,     0],
       [    0,   593,     2,     1,     4]])

Using data from NLP-API

Link to the docs: https://cloud.google.com/natural-language/docs

The NLP-API added sentiment_score (of the consumer_complaint_narrative), mag (complaint sentiment magnitude), ent (extracted entities), ent types (the type of the extracted entities), entity_sent_scores (entities sentiment scores), entity_sent_mag (entities sentiment scores magnitude).


In [10]:
mlb = MultiLabelBinarizer(sparse_output=True)
complaints_df3 = complaints_df.copy()

In [11]:
complaints_df3.dropna(subset=['company_response_to_consumer'],inplace=True)

one_hot1 = pd.get_dummies(complaints_df3['product'], drop_first=True)
one_hot2 = pd.get_dummies(complaints_df3['subproduct'], drop_first=True)
one_hot3 = pd.get_dummies(complaints_df3['issue'], drop_first=True)
one_hot4 = pd.get_dummies(complaints_df3['subissue'], drop_first=True)
one_hot5 = pd.get_dummies(complaints_df3['state'], drop_first=True)
one_hot6 = pd.get_dummies(complaints_df3['zip_code'], drop_first=True)
one_hot7 = pd.get_dummies(complaints_df3['tags'], drop_first=True)

complaints_df3 = complaints_df3.join(one_hot1, rsuffix='prod')
complaints_df3 = complaints_df3.join(one_hot2, rsuffix='subprod')
complaints_df3 = complaints_df3.join(one_hot3, rsuffix='issue')
complaints_df3 = complaints_df3.join(one_hot4, rsuffix='subissue')
complaints_df3 = complaints_df3.join(one_hot5)
#complaints_df2 = complaints_df2.join(one_hot6)
complaints_df3 = complaints_df3.join(one_hot7)

In [12]:
def lister(l):
    l = literal_eval(l)
    double_list = [l]
    return l
def check(l):
    return any(isinstance(el, list) for el in l)

In [13]:
complaints_df3.dropna(subset=['entities'],inplace=True)
complaints_df3['entities2'] = complaints_df3['entities'].apply(lister)

In [14]:
variable = ['company_response_to_consumer',
            'product',
            'subproduct',
            'issue',
            'subissue',
            'state',
            'zip_code',
            'tags',
            'entities']

X = complaints_df3.drop(columns=variable)
y = complaints_df3['company_response_to_consumer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [15]:
#df['entities'] = df['entities'].apply(lister)
entities = mlb.fit(X['entities2'])
mlb.classes_


Out[15]:
array(['! II', '#', '# #', ...,
       '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Student Aid Feedback XXXX To XXXX XX/XX/XXXX',
       '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ XXXX XXXX XXXX To XXXX XX/XX/XXXX',
       '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Apparently'],
      dtype=object)

In [16]:
X_train_ent = mlb.transform(X_train['entities2'])
X_test_ent = mlb.transform(X_test['entities2'])

Just entities

Random forest

Just with the entity data


In [21]:
parameters = {#'class_weight':[None,'balanced'],
              #'max_depth':[2,4,6,8],
              'n_estimators':[10,20,30],
              'n_jobs':[-1]
             }
rf = RandomForestClassifier()
clf = GridSearchCV(rf, parameters)

In [22]:
clf.fit(X_train_ent, y_train)
y_pred = clf.predict(X_train_ent)
y_pred_test = clf.predict(X_test_ent)
y_score_test = clf.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9795 
test accuracy: 0.8002 
multiclass aucroc: 0.5144 
f1 score: 0.7352 

Naive bayes

Just with the entity data


In [20]:
nb.fit(X_train_ent, y_train)
y_pred = nb.predict(X_train_ent)
y_pred_test = nb.predict(X_test_ent)
y_score_test = nb.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.8152 
test accuracy: 0.7831 
multiclass aucroc: 0.5318 
f1 score: 0.741 

Using both sentiment and meta data


In [23]:
variables_to_drop = ['complaint_id',
                     'date_received',
                     'consumer_complaint_narrative',
                     'company_public_response',
                     'company_name',
                    'consumer_consent_provided',
                     'submitted_via',
                     'date_sent_to_company',
                     'timely_response','consumer_disputed',
                    'entity_types',
                     'entity_sentiment_scores',
                     'entity_sentiment_magnitudes',
                     'entities2']
new_x = X.drop(columns=variables_to_drop)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(new_x, y, test_size=0.2, random_state=11)

In [25]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9265 
test accuracy: 0.7529 
multiclass aucroc: 0.5251 
f1 score: 0.7218 


In [26]:
nb.fit(X_train_ent, y_train)
y_pred = nb.predict(X_train_ent)
y_pred_test = nb.predict(X_test_ent)
y_score_test = nb.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.8152 
test accuracy: 0.7831 
multiclass aucroc: 0.5318 
f1 score: 0.741 

Entites with meta data now


In [27]:
x_train_nlp = scipy.sparse.hstack([X_train_ent, X_train])
x_test_nlp = scipy.sparse.hstack([X_test_ent, X_test])

In [29]:
clf.fit(x_train_nlp,y_train)
y_pred = clf.predict(x_train_nlp)
y_pred_test = clf.predict(x_test_nlp)
y_score_test = clf.predict_proba(x_test_nlp)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
     "test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
     "multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
     "f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')


training accuracy: 0.9822 
test accuracy: 0.7995 
multiclass aucroc: 0.5144 
f1 score: 0.7348