In [17]:
from ast import literal_eval

import pandas as pd
import scipy.sparse  # scipy.sparse.hstack is used below; import the submodule explicitly
from google.cloud import bigquery
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
In [2]:
client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))
In [3]:
#query = """
# SELECT
# *
# FROM
# `ut-goog.nl_api.Karan_rolled_up_data`
#"""
#query_job = client.query(
# query,
# Location must match that of the dataset(s) referenced in the query.
# location="US",
#) # API request - starts the query
#complaints_df = query_job.to_dataframe()
#complaints_df.head()
Out[3]:
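If the private `ut-goog` table is not accessible, a similar frame can be pulled from the BigQuery public CFPB complaints dataset; treat this as a sketch, since the public table's column names may differ from the rolled-up table:

# Hypothetical fallback source (public dataset; schema may differ).
fallback_query = """
    SELECT *
    FROM `bigquery-public-data.cfpb_complaints.complaint_database`
"""
# complaints_df = client.query(fallback_query, location="US").to_dataframe()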
In [4]:
categorical_cols = ['product', 'subproduct', 'issue', 'subissue', 'state', 'zip_code', 'tags']
for col in categorical_cols:
    complaints_df[col] = complaints_df[col].astype('category')
variables = ['company_response_to_consumer',
'product',
'subproduct',
'issue',
'subissue',
'state',
'zip_code',
'tags',
'consumer_complaint_narrative']
complaints_df2 = complaints_df.copy()
complaints_df2 = complaints_df2[variables]
complaints_df2.dropna(subset=['company_response_to_consumer'],inplace=True)
# One-hot encode each categorical; zip_code is skipped (far too many levels).
# rsuffix disambiguates columns that collide across the dummy sets.
for col, suffix in [('product', 'prod'), ('subproduct', 'subprod'),
                    ('issue', 'issue'), ('subissue', 'subissue'),
                    ('state', ''), ('tags', '')]:
    one_hot = pd.get_dummies(complaints_df2[col], drop_first=True)
    complaints_df2 = complaints_df2.join(one_hot, rsuffix=suffix)
variable = ['company_response_to_consumer','product','subproduct','issue','subissue','state','zip_code','tags']
X = complaints_df2.drop(columns=variable)
y = complaints_df2['company_response_to_consumer']
#labels = y.unique()
#y = preprocessing.label_binarize(y, classes=labels)
In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train = X_train.drop(columns=['consumer_complaint_narrative'])
X_test = X_test.drop(columns=['consumer_complaint_narrative'])
In [6]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 100, 200, 300],
}
rf = RandomForestClassifier(n_jobs=-1)  # n_jobs is a fixed setting, not a tuned hyperparameter
clf = GridSearchCV(rf, parameters)
clf.get_params()
Out[6]:
In [ ]:
clf.fit(X_train, y_train)
In [8]:
clf.best_params_
Out[8]:
In [9]:
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]  # probability of a single class; unused by the multiclass metrics below
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n")
In [10]:
confusion_matrix(y_test,y_pred_test)
Out[10]:
In [7]:
def multiclass_roc_auc_score(truth, pred, average="macro"):
    # Binarize the true labels and the hard predictions, then average
    # one-vs-rest AUC. Since this uses predicted labels rather than
    # probabilities, it is a coarser score than a probability-based ROC AUC.
    lb = LabelBinarizer()
    lb.fit(truth)
    truth = lb.transform(truth)
    pred = lb.transform(pred)
    return roc_auc_score(truth, pred, average=average)
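The same four-line evaluation block recurs throughout the notebook; a hypothetical helper along these lines would consolidate it (shown as a sketch only; the cells below keep the inline version):

def report_metrics(model, X_tr, y_tr, X_te, y_te):
    # Hypothetical convenience wrapper over the repeated evaluation block.
    y_pred = model.predict(X_tr)
    y_pred_test = model.predict(X_te)
    print("training accuracy:", round(metrics.accuracy_score(y_tr, y_pred), 4))
    print("test accuracy:", round(metrics.accuracy_score(y_te, y_pred_test), 4))
    print("multiclass auroc:", round(multiclass_roc_auc_score(y_te, y_pred_test), 4))
    print("f1 score:", round(f1_score(y_te, y_pred_test, average='weighted'), 4))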
In [12]:
print('multiclass auroc:',multiclass_roc_auc_score(y_test,y_pred_test))
print('f1 score:', f1_score(y_test, y_pred_test, average='weighted'))
In [8]:
nb = MultinomialNB()
In [14]:
nb.fit(X_train, y_train)
y_pred = nb.predict(X_train)
y_pred_test = nb.predict(X_test)
y_score_test = nb.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [15]:
confusion_matrix(y_test,y_pred_test)
Out[15]:
In [9]:
X_text = complaints_df2.drop(columns=variable)
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=11)
train_text = X_train['consumer_complaint_narrative']
test_text = X_test['consumer_complaint_narrative']
In [17]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# Note: the vocabulary is fit on the full corpus (train and test); fitting on
# train_text alone would avoid leaking test-set vocabulary.
count_vect.fit(complaints_df['consumer_complaint_narrative'])
xtrain_count = count_vect.transform(train_text)
xtest_count = count_vect.transform(test_text)
In [18]:
xtrain_count
Out[18]:
In [12]:
X_train2 = X_train.drop(columns=['consumer_complaint_narrative'])
X_test2 = X_test.drop(columns=['consumer_complaint_narrative'])
In [20]:
X_train_counts = scipy.sparse.hstack([xtrain_count, X_train2])
X_test_counts = scipy.sparse.hstack([xtest_count, X_test2])
Note that grid search works on the random forests below, but it takes a long time per random forest (several hours).
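If runtime is a concern, a randomized search over the same grid samples only a few settings instead of trying them all; a minimal sketch (not used below):

from sklearn.model_selection import RandomizedSearchCV

# Sample n_iter parameter settings from the grid rather than exhausting it.
rand_search = RandomizedSearchCV(
    RandomForestClassifier(n_jobs=-1),
    param_distributions={'n_estimators': [10, 100, 200, 300]},
    n_iter=2,
    random_state=11,
)
# rand_search.fit(X_train_counts, y_train)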
In [21]:
#rfc = RandomForestClassifier(n_estimators = 100)
In [22]:
clf.fit(X_train_counts, y_train)
y_pred = clf.predict(X_train_counts)
y_pred_test = clf.predict(X_test_counts)
y_score_test = clf.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [23]:
clf.best_params_
Out[23]:
It seems the model did not need many trees, so the grid is reduced for the next runs.
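One way to confirm this is to inspect the cross-validation scores per setting from the fitted grid search above (a sketch):

# Mean CV score for each n_estimators value tried by GridSearchCV.
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['param_n_estimators', 'mean_test_score']])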
In [9]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 50, 100],
}
rf = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(rf, parameters)
In [25]:
clf.fit(xtrain_count, y_train)
y_pred = clf.predict(xtrain_count)
y_pred_test = clf.predict(xtest_count)
y_score_test = clf.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [26]:
confusion_matrix(y_test,y_pred_test)
Out[26]:
In [32]:
clf.best_params_
Out[32]:
In [27]:
nb.fit(X_train_counts, y_train)
y_pred = nb.predict(X_train_counts)
y_pred_test = nb.predict(X_test_counts)
y_score_test = nb.predict_proba(X_test_counts)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [28]:
nb.fit(xtrain_count, y_train)
y_pred = nb.predict(xtrain_count)
y_pred_test = nb.predict(xtest_count)
y_score_test = nb.predict_proba(xtest_count)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [29]:
confusion_matrix(y_test,y_pred_test)
Out[29]:
In [30]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf = tfidf_vect.transform(train_text)
xtest_tfidf = tfidf_vect.transform(test_text)
In [31]:
X_train_tfidf = scipy.sparse.hstack([xtrain_tfidf, X_train2])
X_test_tfidf = scipy.sparse.hstack([xtest_tfidf, X_test2])
In [33]:
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_train_tfidf)
y_pred_test = clf.predict(X_test_tfidf)
y_score_test = clf.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [34]:
confusion_matrix(y_test,y_pred_test)
Out[34]:
In [35]:
clf.fit(xtrain_tfidf, y_train)
y_pred = clf.predict(xtrain_tfidf)
y_pred_test = clf.predict(xtest_tfidf)
y_score_test = clf.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [36]:
confusion_matrix(y_test,y_pred_test)
Out[36]:
In [37]:
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_train_tfidf)
y_pred_test = nb.predict(X_test_tfidf)
y_score_test = nb.predict_proba(X_test_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [38]:
nb.fit(xtrain_tfidf, y_train)
y_pred = nb.predict(xtrain_tfidf)
y_pred_test = nb.predict(xtest_tfidf)
y_score_test = nb.predict_proba(xtest_tfidf)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [39]:
confusion_matrix(y_test,y_pred_test)
Out[39]:
In [40]:
tfidf_vect_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_vect_bigram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_bigram = tfidf_vect_bigram.transform(train_text)
xvalid_tfidf_bigram = tfidf_vect_bigram.transform(test_text)  # "valid" here is the test split
In [41]:
X_train_tfidf_bi = scipy.sparse.hstack([xtrain_tfidf_bigram, X_train2])
X_test_tfidf_bi = scipy.sparse.hstack([xvalid_tfidf_bigram, X_test2])
In [42]:
clf.fit(X_train_tfidf_bi, y_train)
y_pred = clf.predict(X_train_tfidf_bi)
y_pred_test = clf.predict(X_test_tfidf_bi)
y_score_test = clf.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [43]:
clf.fit(xtrain_tfidf_bigram, y_train)
y_pred = clf.predict(xtrain_tfidf_bigram)
y_pred_test = clf.predict(xvalid_tfidf_bigram)
y_score_test = clf.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [44]:
confusion_matrix(y_test,y_pred_test)
Out[44]:
In [ ]:
nb.fit(X_train_tfidf_bi, y_train)
y_pred = nb.predict(X_train_tfidf_bi)
y_pred_test = nb.predict(X_test_tfidf_bi)
y_score_test = nb.predict_proba(X_test_tfidf_bi)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [ ]:
nb.fit(xtrain_tfidf_bigram, y_train)
y_pred = nb.predict(xtrain_tfidf_bigram)
y_pred_test = nb.predict(xvalid_tfidf_bigram)
y_score_test = nb.predict_proba(xvalid_tfidf_bigram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [ ]:
confusion_matrix(y_test,y_pred_test)
In [10]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(complaints_df['consumer_complaint_narrative'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_text)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(test_text)
In [13]:
X_train_tfidf_n = scipy.sparse.hstack([xtrain_tfidf_ngram, X_train2])
X_test_tfidf_n = scipy.sparse.hstack([xvalid_tfidf_ngram, X_test2])
In [14]:
clf.fit(xtrain_tfidf_ngram, y_train)
y_pred = clf.predict(xtrain_tfidf_ngram)
y_pred_test = clf.predict(xvalid_tfidf_ngram)
y_score_test = clf.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [15]:
confusion_matrix(y_test,y_pred_test)
Out[15]:
In [18]:
nb.fit(X_train_tfidf_n, y_train)
y_pred = nb.predict(X_train_tfidf_n)
y_pred_test = nb.predict(X_test_tfidf_n)
y_score_test = nb.predict_proba(X_test_tfidf_n)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [19]:
nb.fit(xtrain_tfidf_ngram, y_train)
y_pred = nb.predict(xtrain_tfidf_ngram)
y_pred_test = nb.predict(xvalid_tfidf_ngram)
y_score_test = nb.predict_proba(xvalid_tfidf_ngram)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [20]:
confusion_matrix(y_test,y_pred_test)
Out[20]:
Link to the docs: https://cloud.google.com/natural-language/docs
The NLP API added sentiment_score (the document sentiment of the consumer_complaint_narrative), its magnitude, entities (the extracted entities), entity_types (the type of each extracted entity), entity_sentiment_scores (per-entity sentiment scores), and entity_sentiment_magnitudes (their magnitudes).
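A minimal sketch of how such columns could be produced per narrative, assuming the standard google.cloud.language_v1 client (not the exact pipeline used here):

from google.cloud import language_v1

nl_client = language_v1.LanguageServiceClient()

def annotate(text):
    # Document-level sentiment plus entity-level sentiment for one narrative.
    doc = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    sentiment = nl_client.analyze_sentiment(request={"document": doc}).document_sentiment
    entities = nl_client.analyze_entity_sentiment(request={"document": doc}).entities
    return {
        "sentiment_score": sentiment.score,
        "magnitude": sentiment.magnitude,
        "entities": [e.name for e in entities],
        "entity_types": [language_v1.Entity.Type(e.type_).name for e in entities],
        "entity_sentiment_scores": [e.sentiment.score for e in entities],
        "entity_sentiment_magnitudes": [e.sentiment.magnitude for e in entities],
    }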
In [10]:
mlb = MultiLabelBinarizer(sparse_output=True)
complaints_df3 = complaints_df.copy()
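As a quick illustration of what MultiLabelBinarizer produces (a toy example, separate from the pipeline):

# Each row's list of labels becomes a binary indicator vector;
# columns are the sorted unique labels: ['bank', 'fee', 'loan'].
demo = MultiLabelBinarizer().fit_transform([['bank', 'loan'], ['loan'], ['fee']])
print(demo)  # [[1 0 1]
             #  [0 0 1]
             #  [0 1 0]]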
In [11]:
complaints_df3.dropna(subset=['company_response_to_consumer'],inplace=True)
one_hot1 = pd.get_dummies(complaints_df3['product'], drop_first=True)
one_hot2 = pd.get_dummies(complaints_df3['subproduct'], drop_first=True)
one_hot3 = pd.get_dummies(complaints_df3['issue'], drop_first=True)
one_hot4 = pd.get_dummies(complaints_df3['subissue'], drop_first=True)
one_hot5 = pd.get_dummies(complaints_df3['state'], drop_first=True)
one_hot6 = pd.get_dummies(complaints_df3['zip_code'], drop_first=True)
one_hot7 = pd.get_dummies(complaints_df3['tags'], drop_first=True)
complaints_df3 = complaints_df3.join(one_hot1, rsuffix='prod')
complaints_df3 = complaints_df3.join(one_hot2, rsuffix='subprod')
complaints_df3 = complaints_df3.join(one_hot3, rsuffix='issue')
complaints_df3 = complaints_df3.join(one_hot4, rsuffix='subissue')
complaints_df3 = complaints_df3.join(one_hot5)
#complaints_df2 = complaints_df2.join(one_hot6)
complaints_df3 = complaints_df3.join(one_hot7)
In [12]:
def lister(l):
    # Entity lists are stored as strings like "['bank', 'loan']";
    # literal_eval parses them back into Python lists.
    return literal_eval(l)
def check(l):
    # True if any element of l is itself a nested list.
    return any(isinstance(el, list) for el in l)
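For example (hypothetical inputs):

print(lister("['Bank of America', 'mortgage']"))  # ['Bank of America', 'mortgage']
print(check([['nested'], 'flat']))                # True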
In [13]:
complaints_df3.dropna(subset=['entities'],inplace=True)
complaints_df3['entities2'] = complaints_df3['entities'].apply(lister)
In [14]:
variable = ['company_response_to_consumer',
'product',
'subproduct',
'issue',
'subissue',
'state',
'zip_code',
'tags',
'entities']
X = complaints_df3.drop(columns=variable)
y = complaints_df3['company_response_to_consumer']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
In [15]:
# Fit the binarizer on all entity lists so train and test share one label space
# (note this exposes test-set entities to the fitted vocabulary).
mlb.fit(X['entities2'])
mlb.classes_
Out[15]:
In [16]:
X_train_ent = mlb.transform(X_train['entities2'])
X_test_ent = mlb.transform(X_test['entities2'])
In [21]:
parameters = {
    # 'class_weight': [None, 'balanced'],
    # 'max_depth': [2, 4, 6, 8],
    'n_estimators': [10, 20, 30],
}
rf = RandomForestClassifier(n_jobs=-1)
clf = GridSearchCV(rf, parameters)
In [22]:
clf.fit(X_train_ent, y_train)
y_pred = clf.predict(X_train_ent)
y_pred_test = clf.predict(X_test_ent)
y_score_test = clf.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [20]:
nb.fit(X_train_ent, y_train)
y_pred = nb.predict(X_train_ent)
y_pred_test = nb.predict(X_test_ent)
y_score_test = nb.predict_proba(X_test_ent)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [23]:
variables_to_drop = ['complaint_id',
'date_received',
'consumer_complaint_narrative',
'company_public_response',
'company_name',
'consumer_consent_provided',
'submitted_via',
'date_sent_to_company',
'timely_response','consumer_disputed',
'entity_types',
'entity_sentiment_scores',
'entity_sentiment_magnitudes',
'entities2']
new_x = X.drop(columns=variables_to_drop)
In [24]:
X_train, X_test, y_train, y_test = train_test_split(new_x, y, test_size=0.2, random_state=11)
In [25]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_score_test = clf.predict_proba(X_test)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')
In [27]:
x_train_nlp = scipy.sparse.hstack([X_train_ent, X_train])
x_test_nlp = scipy.sparse.hstack([X_test_ent, X_test])
In [29]:
clf.fit(x_train_nlp,y_train)
y_pred = clf.predict(x_train_nlp)
y_pred_test = clf.predict(x_test_nlp)
y_score_test = clf.predict_proba(x_test_nlp)[:,1]
print("training accuracy:", round(metrics.accuracy_score(y_train,y_pred),4), "\n"
"test accuracy:", round(metrics.accuracy_score(y_test,y_pred_test),4), "\n"
"multiclass aucroc:", round(multiclass_roc_auc_score(y_test,y_pred_test),4), '\n'
"f1 score:",round(f1_score(y_test, y_pred_test, average='weighted'),4),'\n')