In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from __future__ import division, print_function
from datetime import datetime
import re
import feature_eng
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import norm
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import ParameterGrid, ParameterSampler, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import seaborn as sns
from xgboost import XGBClassifier
%matplotlib inline
print(pd.__version__)
print(sklearn.__version__)
In [3]:
RAND_SEED_SPLIT = RAND_SEED_TUNING = 9161703
N_ITER_TUNING = 100
In [4]:
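# Load the dataset with precomputed features; the commented lines regenerate them from the raw pickle.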
# df = pd.read_pickle('data/data-post.p')
# df.label = df.label.apply(pd.to_numeric)
# df = feature_eng.extract_features(df)
df = pd.read_csv('data/data_w_features.csv')
df['label'].value_counts()
Out[4]:
In [5]:
df.describe()
Out[5]:
In [5]:
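# Shuffle once with a fixed seed, then split 60/20/20 into train/validation/test.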
train, validate, test = np.split(df.sample(frac=1, random_state=RAND_SEED_SPLIT), [int(.6 * len(df)), int(.8 * len(df))])
train, validate, test = train.reset_index(drop=True), validate.reset_index(drop=True), test.reset_index(drop=True)
pd.DataFrame([train['label'].value_counts(),
validate['label'].value_counts(),
test['label'].value_counts()],
index=['train', 'val', 'test'])
Out[5]:
In [11]:
fig, ax = plt.subplots(figsize=(15,5))
sns.distplot(np.log1p(train.n_token), color='g', ax=ax)
plt.xlabel('log(n_token + 1)')
plt.show()
In [12]:
fig, ax = plt.subplots(figsize=(10,5))
sns.distplot(train.n_char.apply(np.log), fit=norm, fit_kws={'linestyle': '--', 'color': 'r'}, color='b', ax=ax)
plt.xlabel('log(n_char)')
plt.show()
In [13]:
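# Class-conditional length distributions: fit a normal to log(n_char) separately for spam and ham.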
fig, ax = plt.subplots(figsize=(10,5))
# sns.distplot(np.log(validate[validate.label == 1].n_char), kde=True, kde_kws={'alpha': 0}, hist_kws={'color': 'r'}, hist=True)
# sns.distplot(np.log(validate[validate.label == 0].n_char), kde=True, kde_kws={'alpha': 0}, hist_kws={'color': 'g'}, hist=True)
sns.distplot(np.log(train[train.label == 1].n_char), kde=False, hist=False, fit=norm, fit_kws={'color': 'r', 'label': 'Spam'})
sns.distplot(np.log(train[train.label == 0].n_char), kde=False, hist=False, fit=norm, fit_kws={'color': 'g', 'label': 'Not Spam'})
plt.legend()
plt.xlabel('log(n_char)')
plt.ylabel('density')
plt.savefig('by_length.eps', format='eps', dpi=1000)
plt.show()
In [8]:
train.to_pickle('data/train.p')
validate.to_pickle('data/validate.p')
test.to_pickle('data/test.p')
In [17]:
train = pd.read_pickle('data/train.p')
validate = pd.read_pickle('data/validate.p')
test = pd.read_pickle('data/test.p')
In [6]:
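# Hyper-parameter search spaces for the randomized search below.
# SVC uses the usual powers-of-two grid for C and gamma; XGBoost's scale_pos_weight
# ranges from 1 up to the ham/spam class ratio of the training set.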
PARAM_DISTRIBS = {
BernoulliNB: {
'fit_prior': [True, False],
},
GaussianNB: {},
SVC: {
'C': [2**e for e in [-5, -3, 0, 3, 5, 7, 9]],
'gamma': [2**e for e in [-9, -7, -5, -3, 0, 3, 5, 7, 9]],
},
XGBClassifier: {
'n_estimators': [10, 20, 50, 100],
'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
'reg_alpha': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
'reg_lambda': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
'scale_pos_weight': np.linspace(1., (train['label'] == 0).sum() / train['label'].sum(), 10),  # 1 .. ham/spam ratio
},
}
f = [
'n_token',
'n_capital',
'n_emoji',
'n_unique_emoji',
'n_number',
'n_mention',
'%_capital',
'%_number',
'%_emoji',
'%_unique_emoji',
'log_char',
'has_phone_number',
'has_bbm_pin',
]
keywords = ['has_pattern_' + p for p in feature_eng.SPAMMY_PATTERNS]
In [7]:
def classify(X_train, y_train, X_val, y_val, classifiers):
    """Fit each (name, estimator) pair with default parameters and report validation metrics (in %)."""
    metrics = []
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_val)
        precision = precision_score(y_val, predicted)
        recall = recall_score(y_val, predicted)
        f1 = f1_score(y_val, predicted)
        metrics.append([name, precision, recall, f1])
    return pd.DataFrame(metrics, columns=['classifier', 'precision', 'recall', 'f1']).set_index('classifier') * 100

def classify_tuned(X_train, y_train, X_val, y_val, clf_class, param_grid=None, n_iter=100, rand_seed=None):
    """Randomized hyper-parameter search for one classifier class; returns (best_params, positive-class metrics)."""
    if clf_class == SVC:
        # SVC is sensitive to feature scale, so min/max-scale to [0, 1] first.
        scaler = MinMaxScaler(feature_range=(0., 1.))
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
    X_train, y_train = np.array(X_train), np.array(y_train)
    X_val, y_val = np.array(X_val), np.array(y_val)
    if not param_grid:
        param_grid = PARAM_DISTRIBS[clf_class]
    # Never sample more candidates than the grid actually contains.
    n_iter = min(n_iter, len(ParameterGrid(param_grid)))
    clf = RandomizedSearchCV(clf_class(), param_grid, n_iter=n_iter, n_jobs=4, cv=2,
                             scoring='f1', random_state=rand_seed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, warn_for=())
    # Index 1 holds the metrics for the positive (spam) class.
    return clf.best_params_, {'precision': precision[1], 'recall': recall[1], 'f1': f1[1]}

def classify_multiple_clfs(X_train, y_train, X_val, y_val, clf_classes, cls_param_grids=None, n_iter=100, rand_seed=None):
    """Run classify_tuned for every classifier class and collect the results in one table."""
    if not cls_param_grids:
        cls_param_grids = dict()
    result = {cls: classify_tuned(X_train, y_train, X_val, y_val, cls,
                                  param_grid=cls_param_grids.get(cls),
                                  n_iter=n_iter, rand_seed=rand_seed)
              for cls in clf_classes}
    best_params = {cls: res[0] for cls, res in result.items()}
    metrics = pd.DataFrame({cls: res[1] for cls, res in result.items()}).T
    metrics.index = [cls.__name__ for cls in metrics.index]
    metrics = metrics.sort_index()
    return best_params, metrics
In [8]:
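# Final evaluation: refit on train+validation combined, score on the held-out test set.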
X_train_orig = pd.concat([train, validate], ignore_index=True)
y_train = X_train_orig['label']
y_test = test['label']
In [9]:
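# Silence 'ill-defined' metric warnings (raised when a fold predicts only one class) and deprecation noise.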
import warnings
warnings.filterwarnings('ignore', message='.*is ill-defined', append=True)
warnings.filterwarnings('ignore', category=DeprecationWarning)
In [10]:
X_train, X_test = X_train_orig[f], test[f]
best_params_basic, metrics_basic = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_basic
Out[10]:
In [11]:
X_train, X_test = X_train_orig[keywords], test[keywords]
best_params_keywords, metrics_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[BernoulliNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_keywords
Out[11]:
In [12]:
X_train, X_test = X_train_orig[f + keywords], test[f + keywords]
best_params_A, metrics_A = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_A
Out[12]:
In [13]:
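# Bag-of-words baseline: binary counts reduced to 100 LSA dimensions.
# Note the pipeline is fit on train only, then applied to train+validation and test.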
bow_pipeline = Pipeline([
('CountVectorizer', CountVectorizer(min_df=5, binary=True)),
('LSA', TruncatedSVD(n_components=100)),
])
bow_pipeline.fit(train['text'])
X_train_bow, X_test_bow = bow_pipeline.transform(X_train_orig['text']), bow_pipeline.transform(test['text'])
best_params_bow, metrics_bow = classify_multiple_clfs(X_train_bow, y_train, X_test_bow, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_bow
Out[13]:
In [23]:
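# Same 100-dimension LSA reduction, but over TF-IDF weights instead of binary counts.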
tfidf_pipeline = Pipeline([
('TfidfVectorizer', TfidfVectorizer(min_df=5)),
('LSA', TruncatedSVD(n_components=100)),
])
tfidf_pipeline.fit(train['text'])
X_train_tfidf, X_test_tfidf = tfidf_pipeline.transform(X_train_orig['text']), tfidf_pipeline.transform(test['text'])
best_params_tfidf, metrics_tfidf = classify_multiple_clfs(X_train_tfidf, y_train, X_test_tfidf, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_tfidf
Out[23]:
In [24]:
# ftext_skipgram = feature_eng.train_model_fasttext(feature_eng.tokenize(train['text']),
# file_in='tmp/fasttext_train.txt',
# path_out='models/fasttext_skipgram')
In [25]:
# fastText sentence embeddings from the skip-gram model trained in the commented cell above
X_train_ftext = feature_eng.extract_fasttext(X_train_orig['text'], fpath_model='models/fasttext_skipgram.bin').reset_index(drop=True)
X_test_ftext = feature_eng.extract_fasttext(test['text'], fpath_model='models/fasttext_skipgram.bin').reset_index(drop=True)
best_params_ftext, metrics_ftext = classify_multiple_clfs(X_train_ftext, y_train, X_test_ftext, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext
Out[25]:
In [26]:
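# Combined representation: BoW-LSA components concatenated with the handcrafted features f.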
X_train = pd.concat([pd.DataFrame(X_train_bow), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_bow), test[f]], axis=1)
best_params_bow_f, metrics_bow_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_bow_f
Out[26]:
In [27]:
X_train = pd.concat([pd.DataFrame(X_train_bow), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_bow), test[f + keywords]], axis=1)
best_params_bow_f_keywords, metrics_bow_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING,
rand_seed=RAND_SEED_TUNING)
metrics_bow_f_keywords
Out[27]:
In [28]:
X_train = pd.concat([pd.DataFrame(X_train_tfidf), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_tfidf), test[f]], axis=1)
best_params_tfidf_f, metrics_tfidf_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_tfidf_f
Out[28]:
In [29]:
X_train = pd.concat([pd.DataFrame(X_train_tfidf), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_tfidf), test[f + keywords]], axis=1)
best_params_tfidf_f_keywords, metrics_tfidf_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING,
rand_seed=RAND_SEED_TUNING)
metrics_tfidf_f_keywords
Out[29]:
In [43]:
X_train = pd.concat([pd.DataFrame(X_train_ftext), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_ftext), test[f]], axis=1)
best_params_ftext_f, metrics_ftext_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext_f
Out[43]:
In [31]:
X_train = pd.concat([pd.DataFrame(X_train_ftext), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_ftext), test[f + keywords]], axis=1)
best_params_ftext_f_keywords, metrics_ftext_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext_f_keywords
Out[31]:
In [51]:
best_params_basic
Out[51]:
In [52]:
best_params_keywords
Out[52]:
In [53]:
best_params_A
Out[53]:
In [54]:
best_params_bow
Out[54]:
In [55]:
best_params_tfidf
Out[55]:
In [56]:
best_params_ftext
Out[56]:
In [57]:
best_params_bow_f
Out[57]:
In [58]:
best_params_bow_f_keywords
Out[58]:
In [59]:
best_params_tfidf_f
Out[59]:
In [60]:
best_params_tfidf_f_keywords
Out[60]:
In [61]:
best_params_ftext_f
Out[61]:
In [62]:
best_params_ftext_f_keywords
Out[62]:
In [11]:
X_train = pd.concat([train[f], feature_eng.extract_fasttext(train.text)], axis=1)
X_val = pd.concat([validate[f], feature_eng.extract_fasttext(validate.text)], axis=1)
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [12]:
X_train = pd.concat([train[f + keywords], feature_eng.extract_fasttext(train.text)], axis=1)
X_val = pd.concat([validate[f + keywords], feature_eng.extract_fasttext(validate.text)], axis=1)
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
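The next cells reuse a vectorizer and document-term matrix X that are never defined in this transcript; a minimal reconstruction, assuming the same min_df=5 CountVectorizer that the final test-set cell uses (this cell is an editorial assumption, not recovered notebook output):
In [ ]:
vectorizer = CountVectorizer(min_df=5)  # assumed to mirror the CountVectorizer in the final test-set cell
X = vectorizer.fit_transform(train.text)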
In [14]:
X_train = X.toarray()
X_val = vectorizer.transform(validate.text).toarray()
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [10]:
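# Default-parameter classifiers used by the earlier (un-tuned) classify() helper;
# the SVM is wrapped in a pipeline so features are min/max-scaled before fitting.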
classifiers = [
('Naive Bayes', GaussianNB()),
('SVM', Pipeline([
('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
('SVM', SVC())])),
('XGBoost', XGBClassifier())
]
In [11]:
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(validate.text))
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [9]:
X_train = pd.concat([train[f].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [12]:
X_train = pd.concat([train[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [6]:
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(train.text)
In [7]:
X_train = X.toarray()
X_val = vectorizer.transform(validate.text).toarray()
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [8]:
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(validate.text))
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [11]:
X_train = pd.concat([train[f].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [25]:
X_train = pd.concat([train[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [6]:
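# Un-tuned final runs: refit the default classifiers on train+validation and score on test.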
tv = pd.concat([train, validate], ignore_index=True)  # ignore_index so the axis=1 concats below align row-by-row
In [7]:
X_train = pd.concat([tv[f + keywords], feature_eng.extract_fasttext(tv.text)], axis=1)
X_val = pd.concat([test[f + keywords], feature_eng.extract_fasttext(test.text)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [8]:
vectorizer = CountVectorizer(min_df=5)
X = vectorizer.fit_transform(tv.text)
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(test.text))
X_train = pd.concat([tv[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([test[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [9]:
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(tv.text)
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(test.text))
X_train = pd.concat([tv[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([test[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))