In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
from __future__ import division, print_function
from datetime import datetime
import re
import feature_eng
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from scipy.stats import norm
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import ParameterGrid, ParameterSampler, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import seaborn as sns
from xgboost import XGBClassifier
%matplotlib inline
print(pd.__version__)
print(sklearn.__version__)
In [3]:
RAND_SEED_SPLIT = RAND_SEED_TUNING = 9161703
N_ITER_TUNING = 100
In [4]:
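# Load the dataset with precomputed features; the commented lines regenerate them from the raw pickle.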
# df = pd.read_pickle('data/data-post.p')
# df.label = df.label.apply(pd.to_numeric)
# df = feature_eng.extract_features(df)
df = pd.read_csv('data/data_w_features.csv')
df['label'].value_counts()
Out[4]:
In [5]:
df.describe()
Out[5]:
In [5]:
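# Shuffle once with a fixed seed, then split 60/20/20 into train/validation/test.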
train, validate, test = np.split(df.sample(frac=1, random_state=RAND_SEED_SPLIT), [int(.6 * len(df)), int(.8 * len(df))])
train, validate, test = train.reset_index(drop=True), validate.reset_index(drop=True), test.reset_index(drop=True)
pd.DataFrame([train['label'].value_counts(),
validate['label'].value_counts(),
test['label'].value_counts()],
index=['train', 'val', 'test'])
Out[5]:
In [11]:
fig, ax = plt.subplots(figsize=(15,5))
sns.distplot(np.log1p(train.n_token), color='g', ax=ax)
plt.xlabel('log(n_token + 1)')
plt.show()
In [12]:
fig, ax = plt.subplots(figsize=(10,5))
sns.distplot(train.n_char.apply(np.log), fit=norm, fit_kws={'linestyle': '--', 'color': 'r'}, color='b', ax=ax)
plt.xlabel('log(n_char)')
plt.show()
In [13]:
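# Class-conditional length distributions: fit a normal to log(n_char) separately for spam and ham.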
fig, ax = plt.subplots(figsize=(10,5))
# sns.distplot(np.log(validate[validate.label == 1].n_char), kde=True, kde_kws={'alpha': 0}, hist_kws={'color': 'r'}, hist=True)
# sns.distplot(np.log(validate[validate.label == 0].n_char), kde=True, kde_kws={'alpha': 0}, hist_kws={'color': 'g'}, hist=True)
sns.distplot(np.log(train[train.label == 1].n_char), kde=False, hist=False, fit=norm, fit_kws={'color': 'r', 'label': 'Spam'})
sns.distplot(np.log(train[train.label == 0].n_char), kde=False, hist=False, fit=norm, fit_kws={'color': 'g', 'label': 'Not Spam'})
plt.legend()
plt.xlabel('log(n_char)')
plt.ylabel('density')
plt.savefig('by_length.eps', format='eps', dpi=1000)
plt.show()
In [8]:
train.to_pickle('data/train.p')
validate.to_pickle('data/validate.p')
test.to_pickle('data/test.p')
In [17]:
train = pd.read_pickle('data/train.p')
validate = pd.read_pickle('data/validate.p')
test = pd.read_pickle('data/test.p')
In [6]:
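# Hyper-parameter search spaces for the randomized search below.
# SVC uses the usual powers-of-two grid for C and gamma; XGBoost's scale_pos_weight
# ranges from 1 up to the ham/spam class ratio of the training set.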
PARAM_DISTRIBS = {
BernoulliNB: {
'fit_prior': [True, False],
},
GaussianNB: {},
SVC: {
'C': [2**e for e in [-5, -3, 0, 3, 5, 7, 9]],
'gamma': [2**e for e in [-9, -7, -5, -3, 0, 3, 5, 7, 9]],
},
XGBClassifier: {
'n_estimators': [10, 20, 50, 100],
'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
'reg_alpha': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
'reg_lambda': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
'scale_pos_weight': np.linspace(1., (train['label'] == 0).sum() / train['label'].sum(), 10),  # 1 .. ham/spam ratio
},
}
f = [
'n_token',
'n_capital',
'n_emoji',
'n_unique_emoji',
'n_number',
'n_mention',
'%_capital',
'%_number',
'%_emoji',
'%_unique_emoji',
'log_char',
'has_phone_number',
'has_bbm_pin',
]
keywords = ['has_pattern_' + p for p in feature_eng.SPAMMY_PATTERNS]
In [7]:
def classify(X_train, y_train, X_val, y_val, classifiers):
    """Fit each (name, estimator) pair with default parameters and report validation metrics (in %)."""
    metrics = []
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_val)
        precision = precision_score(y_val, predicted)
        recall = recall_score(y_val, predicted)
        f1 = f1_score(y_val, predicted)
        metrics.append([name, precision, recall, f1])
    return pd.DataFrame(metrics, columns=['classifier', 'precision', 'recall', 'f1']).set_index('classifier') * 100

def classify_tuned(X_train, y_train, X_val, y_val, clf_class, param_grid=None, n_iter=100, rand_seed=None):
    """Randomized hyper-parameter search for one classifier class; returns (best_params, positive-class metrics)."""
    if clf_class == SVC:
        # SVC is sensitive to feature scale, so min/max-scale to [0, 1] first.
        scaler = MinMaxScaler(feature_range=(0., 1.))
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
    X_train, y_train = np.array(X_train), np.array(y_train)
    X_val, y_val = np.array(X_val), np.array(y_val)
    if not param_grid:
        param_grid = PARAM_DISTRIBS[clf_class]
    # Never sample more candidates than the grid actually contains.
    n_iter = min(n_iter, len(ParameterGrid(param_grid)))
    clf = RandomizedSearchCV(clf_class(), param_grid, n_iter=n_iter, n_jobs=4, cv=2,
                             scoring='f1', random_state=rand_seed)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, warn_for=())
    # Index 1 holds the metrics for the positive (spam) class.
    return clf.best_params_, {'precision': precision[1], 'recall': recall[1], 'f1': f1[1]}

def classify_multiple_clfs(X_train, y_train, X_val, y_val, clf_classes, cls_param_grids=None, n_iter=100, rand_seed=None):
    """Run classify_tuned for every classifier class and collect the results in one table."""
    if not cls_param_grids:
        cls_param_grids = dict()
    result = {cls: classify_tuned(X_train, y_train, X_val, y_val, cls,
                                  param_grid=cls_param_grids.get(cls),
                                  n_iter=n_iter, rand_seed=rand_seed)
              for cls in clf_classes}
    best_params = {cls: res[0] for cls, res in result.items()}
    metrics = pd.DataFrame({cls: res[1] for cls, res in result.items()}).T
    metrics.index = [cls.__name__ for cls in metrics.index]
    metrics = metrics.sort_index()
    return best_params, metrics
In [8]:
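# Final evaluation: refit on train+validation combined, score on the held-out test set.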
X_train_orig = pd.concat([train, validate], ignore_index=True)
y_train = X_train_orig['label']
y_test = test['label']
In [9]:
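# Silence 'ill-defined' metric warnings (raised when a fold predicts only one class) and deprecation noise.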
import warnings
warnings.filterwarnings('ignore', message='.*is ill-defined', append=True)
warnings.filterwarnings('ignore', category=DeprecationWarning)
In [10]:
X_train, X_test = X_train_orig[f], test[f]
best_params_basic, metrics_basic = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_basic
Out[10]:
In [11]:
X_train, X_test = X_train_orig[keywords], test[keywords]
best_params_keywords, metrics_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[BernoulliNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_keywords
Out[11]:
In [12]:
X_train, X_test = X_train_orig[f + keywords], test[f + keywords]
best_params_A, metrics_A = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_A
Out[12]:
In [13]:
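# Bag-of-words baseline: binary counts reduced to 100 LSA dimensions.
# Note the pipeline is fit on train only, then applied to train+validation and test.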
bow_pipeline = Pipeline([
('CountVectorizer', CountVectorizer(min_df=5, binary=True)),
('LSA', TruncatedSVD(n_components=100)),
])
bow_pipeline.fit(train['text'])
X_train_bow, X_test_bow = bow_pipeline.transform(X_train_orig['text']), bow_pipeline.transform(test['text'])
best_params_bow, metrics_bow = classify_multiple_clfs(X_train_bow, y_train, X_test_bow, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_bow
Out[13]:
In [23]:
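# Same 100-dimension LSA reduction, but over TF-IDF weights instead of binary counts.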
tfidf_pipeline = Pipeline([
('TfidfVectorizer', TfidfVectorizer(min_df=5)),
('LSA', TruncatedSVD(n_components=100)),
])
tfidf_pipeline.fit(train['text'])
X_train_tfidf, X_test_tfidf = tfidf_pipeline.transform(X_train_orig['text']), tfidf_pipeline.transform(test['text'])
best_params_tfidf, metrics_tfidf = classify_multiple_clfs(X_train_tfidf, y_train, X_test_tfidf, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_tfidf
Out[23]:
In [24]:
# ftext_skipgram = feature_eng.train_model_fasttext(feature_eng.tokenize(train['text']),
# file_in='tmp/fasttext_train.txt',
# path_out='models/fasttext_skipgram')
In [25]:
# fastText sentence embeddings from the skip-gram model trained in the commented cell above
X_train_ftext = feature_eng.extract_fasttext(X_train_orig['text'], fpath_model='models/fasttext_skipgram.bin').reset_index(drop=True)
X_test_ftext = feature_eng.extract_fasttext(test['text'], fpath_model='models/fasttext_skipgram.bin').reset_index(drop=True)
best_params_ftext, metrics_ftext = classify_multiple_clfs(X_train_ftext, y_train, X_test_ftext, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext
Out[25]:
In [26]:
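# Combined representation: BoW-LSA components concatenated with the handcrafted features f.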
X_train = pd.concat([pd.DataFrame(X_train_bow), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_bow), test[f]], axis=1)
best_params_bow_f, metrics_bow_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_bow_f
Out[26]:
In [27]:
X_train = pd.concat([pd.DataFrame(X_train_bow), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_bow), test[f + keywords]], axis=1)
best_params_bow_f_keywords, metrics_bow_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING,
rand_seed=RAND_SEED_TUNING)
metrics_bow_f_keywords
Out[27]:
In [28]:
X_train = pd.concat([pd.DataFrame(X_train_tfidf), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_tfidf), test[f]], axis=1)
best_params_tfidf_f, metrics_tfidf_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_tfidf_f
Out[28]:
In [29]:
X_train = pd.concat([pd.DataFrame(X_train_tfidf), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_tfidf), test[f + keywords]], axis=1)
best_params_tfidf_f_keywords, metrics_tfidf_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING,
rand_seed=RAND_SEED_TUNING)
metrics_tfidf_f_keywords
Out[29]:
In [43]:
X_train = pd.concat([pd.DataFrame(X_train_ftext), X_train_orig[f]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_ftext), test[f]], axis=1)
best_params_ftext_f, metrics_ftext_f = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext_f
Out[43]:
In [31]:
X_train = pd.concat([pd.DataFrame(X_train_ftext), X_train_orig[f + keywords]], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_ftext), test[f + keywords]], axis=1)
best_params_ftext_f_keywords, metrics_ftext_f_keywords = classify_multiple_clfs(X_train, y_train, X_test, y_test,
[GaussianNB, SVC, XGBClassifier],
n_iter=N_ITER_TUNING, rand_seed=RAND_SEED_TUNING)
metrics_ftext_f_keywords
Out[31]:
In [51]:
best_params_basic
Out[51]:
In [52]:
best_params_keywords
Out[52]:
In [53]:
best_params_A
Out[53]:
In [54]:
best_params_bow
Out[54]:
In [55]:
best_params_tfidf
Out[55]:
In [56]:
best_params_ftext
Out[56]:
In [57]:
best_params_bow_f
Out[57]:
In [58]:
best_params_bow_f_keywords
Out[58]:
In [59]:
best_params_tfidf_f
Out[59]:
In [60]:
best_params_tfidf_f_keywords
Out[60]:
In [61]:
best_params_ftext_f
Out[61]:
In [62]:
best_params_ftext_f_keywords
Out[62]:
In [11]:
X_train = pd.concat([train[f], feature_eng.extract_fasttext(train.text)], axis=1)
X_val = pd.concat([validate[f], feature_eng.extract_fasttext(validate.text)], axis=1)
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [12]:
X_train = pd.concat([train[f + keywords], feature_eng.extract_fasttext(train.text)], axis=1)
X_val = pd.concat([validate[f + keywords], feature_eng.extract_fasttext(validate.text)], axis=1)
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
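The next cells reuse a vectorizer and document-term matrix X that are never defined in this transcript; a minimal reconstruction, assuming the same min_df=5 CountVectorizer that the final test-set cell uses (this cell is an editorial assumption, not recovered notebook output):
In [ ]:
vectorizer = CountVectorizer(min_df=5)  # assumed to mirror the CountVectorizer in the final test-set cell
X = vectorizer.fit_transform(train.text)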
In [14]:
X_train = X.toarray()
X_val = vectorizer.transform(validate.text).toarray()
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [10]:
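# Default-parameter classifiers used by the earlier (un-tuned) classify() helper;
# the SVM is wrapped in a pipeline so features are min/max-scaled before fitting.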
classifiers = [
('Naive Bayes', GaussianNB()),
('SVM', Pipeline([
('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
('SVM', SVC())])),
('XGBoost', XGBClassifier())
]
In [11]:
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(validate.text))
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [9]:
X_train = pd.concat([train[f].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [12]:
X_train = pd.concat([train[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [6]:
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(train.text)
In [7]:
X_train = X.toarray()
X_val = vectorizer.transform(validate.text).toarray()
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [8]:
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(validate.text))
y_train = train['label']
y_val = validate['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [11]:
X_train = pd.concat([train[f].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [25]:
X_train = pd.concat([train[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([validate[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [6]:
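# Un-tuned final runs: refit the default classifiers on train+validation and score on test.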
tv = pd.concat([train, validate], ignore_index=True)  # ignore_index so the axis=1 concats below align row-by-row
In [7]:
X_train = pd.concat([tv[f + keywords], feature_eng.extract_fasttext(tv.text)], axis=1)
X_val = pd.concat([test[f + keywords], feature_eng.extract_fasttext(test.text)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [8]:
vectorizer = CountVectorizer(min_df=5)
X = vectorizer.fit_transform(tv.text)
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(test.text))
X_train = pd.concat([tv[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([test[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))
In [9]:
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(tv.text)
svd = TruncatedSVD(n_components=100)
X_train = svd.fit_transform(X)
X_val = svd.transform(vectorizer.transform(test.text))
X_train = pd.concat([tv[f + keywords].reset_index(drop=True), pd.DataFrame(X_train)], axis=1)
X_val = pd.concat([test[f + keywords].reset_index(drop=True), pd.DataFrame(X_val)], axis=1)
y_train = tv['label']
y_val = test['label']
print(classify(X_train, y_train, X_val, y_val, classifiers))