In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from lib.utils.db_conn import DBConn
from pickle import dump, load
import os
import re
from src._extraction_utils import DFColumnExtractor, twitter_tokenizer
from IPython.core.pylabtools import figsize
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report
%matplotlib inline
CONFIG = '/Users/jjardel/dev/distractingdonald/config/db_creds_local.json'
In [2]:
# Pull the cleaned tweet-features table from the database.
conn = DBConn(CONFIG)
data = conn.export('crazy_tweet_features', schema='clean')

# Binary target: 1 = tweet sent from Android, 0 = anything else.
data.tweet_source = data.tweet_source.apply(lambda x: 1 if x == 'android' else 0)

# Collapse every t.co short link to a single token so TF-IDF treats all
# URLs identically. Fixes from the original pattern: raw string (no
# invalid '\/' escapes), escaped dot in 't\.co' (the bare '.' matched any
# character), and explicit regex=True (pandas 2.0+ defaults to literal
# replacement).
data.text = data.text.str.replace(
    r'https?://t\.co/[a-zA-Z0-9\-\.]{8,}', 'twitter_url ', regex=True
)
data.head()
Out[2]:
In [3]:
TEXT_FEATURES = 'text'

# Engineered numeric columns modeled alongside the TF-IDF text features.
NON_TEXT_FEATURES = [
    'favorites',
    'retweets',
    'retweets_to_faves',
    'num_exclamation_points',
    'num_uppercase_strings',
    'is_trump_retweet'
    #'is_tweetstorm'
]

# Text branch: extract the text column, then TF-IDF with a tweet-aware
# tokenizer.
text_pipeline = Pipeline([
    ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
    ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
])

# Numeric branch: MinMaxScaler maps each feature to [0, 1], keeping
# everything non-negative as MultinomialNB requires.
numeric_pipeline = Pipeline([
    ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
    ('scaler', MinMaxScaler())
])

nb_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_processing', text_pipeline),
        ('num_processing', numeric_pipeline)
    ])),
    ('clf', MultinomialNB())
])

# random_state pins the forest so grid-search scores are reproducible
# across re-runs (the original estimator was unseeded); 42 matches the
# train/test split seed used below.
rf_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_processing', text_pipeline),
        ('num_processing', numeric_pipeline)
    ])),
    ('clf', RandomForestClassifier(random_state=42))
])

# Hyperparameter grids; nested pipeline steps are addressed with '__'.
nb_params = {
    'features__text_processing__vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': np.logspace(-2, 0, num=10)
}
rf_params = {
    'features__text_processing__vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__n_estimators': [int(x) for x in np.logspace(1, 3, num=10)]
}
In [4]:
# Split on row positions so the same indices can slice both the
# DataFrame and the target array (stratified 80/20, fixed seed).
labels = data.tweet_source.values
positions = np.arange(len(data))
idx_train, idx_test, _, _ = train_test_split(
    positions,
    labels,
    train_size=0.8,
    stratify=labels,
    random_state=42
)

train_df = data.iloc[idx_train, :]
test_df = data.iloc[idx_test, :]
y_train = labels[idx_train]
y_test = labels[idx_test]
In [8]:
# 5-fold grid search over each candidate pipeline, using all cores.
# GridSearchCV.fit returns the fitted search object, so construction and
# fitting chain into one statement.
nb_gs = GridSearchCV(nb_pipeline, nb_params, n_jobs=-1, cv=5).fit(train_df, y_train)
print('Best Naive Bayes Classifier has accuracy score: {0}'.format(nb_gs.best_score_))

rf_gs = GridSearchCV(rf_pipeline, rf_params, n_jobs=-1, cv=5).fit(train_df, y_train)
print('Best Random Forest Classifier has accuracy score: {0}'.format(rf_gs.best_score_))
In [6]:
# Refit the winning Naive Bayes configuration on the full dataset
# (standard practice after model selection), then persist it for reuse.
best_model = nb_gs.best_estimator_
best_model.fit(data, data.tweet_source.values)

joblib.dump(best_model, 'best_model.pkl', compress=1)
Out[6]:
In [ ]:
In [7]:
# Spot-check: class-1 (Android) probabilities from the RF grid search on
# the first 20 training tweets, alongside text and true label.
# NOTE(review): inspects rf_gs even though the NB model was the one
# persisted -- confirm which model is intended here.
android_probs = rf_gs.predict_proba(train_df)[:, 1]
for prob, text, label in zip(android_probs[:20], train_df.text.values[:20], y_train[:20]):
    print(prob, text, label)