In [1]:
import os
import re
from pickle import dump, load

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from IPython.core.pylabtools import figsize

from nltk.tokenize import TweetTokenizer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from lib.utils.db_conn import DBConn
from src._extraction_utils import DFColumnExtractor, twitter_tokenizer

%matplotlib inline

# Path handed to DBConn when opening the database connection. It can be
# overridden per machine through the DB_CREDS_CONFIG environment variable; the
# original developer path stays as the fallback so existing setups keep working.
CONFIG = os.environ.get(
    'DB_CREDS_CONFIG',
    '/Users/jjardel/dev/distractingdonald/config/db_creds_local.json'
)


/Users/jjardel/anaconda/envs/python3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/jjardel/anaconda/envs/python3/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [2]:
# Export the `crazy_tweet_features` table from the `clean` schema.
conn = DBConn(CONFIG)
data = conn.export('crazy_tweet_features', schema='clean')

# Binary target: 1 when the tweet source is 'android', 0 for anything else
# (missing values compare unequal and therefore also map to 0).
data['tweet_source'] = (data['tweet_source'] == 'android').astype(int)

# Standardize all t.co short links to a single placeholder token.
# The pattern is a raw string (the escaped form is an invalid escape sequence
# in a plain literal) and the dot in `t.co` is escaped so that it only matches
# a literal dot rather than any character.
# NOTE: this relies on str.replace interpreting the pattern as a regular
# expression, which is the default before pandas 2.0; pass regex=True when
# running on a newer pandas.
data['text'] = data['text'].str.replace(
    r'https?://t\.co/[a-zA-Z0-9\-.]{8,}',
    'twitter_url '
)

data.head()


Out[2]:
created_at favorites followers id_str num_statuses quoted_status_id_str quoted_status_text retweets text tweet_source user_id_str user_name user_screen_name retweets_to_faves num_exclamation_points num_characters num_uppercase_strings is_trump_retweet is_tweetstorm
0 2016-06-09 12:29:24 33612.0 22616750.0 740883494117728256 34390.0 None None 12006.0 Crooked Hillary Clinton will be a disaster on ... 1 25073877 Donald J. Trump realDonaldTrump 0.357194 1 139 1 0 0
1 2016-06-09 18:22:21 83496.0 22616750.0 740972317191352320 34390.0 None None 36029.0 Obama just endorsed Crooked Hillary. He wants ... 0 25073877 Donald J. Trump realDonaldTrump 0.431506 1 92 1 0 0
2 2016-06-09 19:26:49 15177.0 22616750.0 740988538464948224 34390.0 None None 4888.0 A message of condolences and support regarding... 0 25073877 Donald J. Trump realDonaldTrump 0.322066 0 106 1 0 0
3 2016-06-09 20:40:32 294381.0 22616750.0 741007091947556864 34390.0 740973710593654784 Delete your account. https://t.co/Oa92sncRQY 166469.0 How long did it take your staff of 823 people ... 0 25073877 Donald J. Trump realDonaldTrump 0.565488 0 138 0 0 0
4 2016-06-09 21:15:11 21200.0 22616750.0 741015811960229893 34390.0 None None 7121.0 Thank you Roseanne, very much appreciated. twi... 0 25073877 Donald J. Trump realDonaldTrump 0.335896 0 66 0 0 0

In [3]:
# Column holding the raw tweet text, and the numeric columns fed to the models.
TEXT_FEATURES = 'text'

NON_TEXT_FEATURES = [
    'favorites',
    'retweets',
    'retweets_to_faves',
    'num_exclamation_points',
    'num_uppercase_strings',
    'is_trump_retweet',
    # 'is_tweetstorm',
]

# Seed for the random forest so that repeated runs give the same model.
RF_RANDOM_STATE = 42


def _make_text_pipeline():
    """Return a new pipeline: select the text column, then TF-IDF vectorize it."""
    return Pipeline([
        ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
        ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer)),
    ])


def _make_numeric_pipeline():
    """Return a new pipeline: select the numeric columns, then min-max scale them."""
    return Pipeline([
        ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
        ('scaler', MinMaxScaler()),
    ])


def _make_feature_union():
    """Return a new FeatureUnion combining the text and numeric pipelines.

    Every call builds independent transformer instances, so fitting one model
    pipeline never refits the transformers that belong to another one.
    """
    return FeatureUnion([
        ('text_processing', _make_text_pipeline()),
        ('num_processing', _make_numeric_pipeline()),
    ])


# Stand-alone instances, kept available under their original names.
text_pipeline = _make_text_pipeline()
numeric_pipeline = _make_numeric_pipeline()

# Step names ('features', 'text_processing', 'vect', 'clf') are referenced by
# the parameter grids below and must stay in sync with them.
nb_pipeline = Pipeline([
    ('features', _make_feature_union()),
    ('clf', MultinomialNB()),
])

rf_pipeline = Pipeline([
    ('features', _make_feature_union()),
    ('clf', RandomForestClassifier(random_state=RF_RANDOM_STATE)),
])


# Hyper-parameter grids: n-gram range of the TF-IDF vectorizer plus one
# classifier-specific parameter, each on a 10-point log-spaced grid.
nb_params = {
    'features__text_processing__vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': np.logspace(-2, 0, num=10)
}

rf_params = {
    'features__text_processing__vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__n_estimators': [int(x) for x in np.logspace(1, 3, num=10)]
}

In [4]:
# Stratified 80/20 split on the target. Row positions are split instead of the
# frame itself so the same indices slice both the feature frame and the labels.
labels = data.tweet_source.values
idx_array = np.arange(len(data))

idx_train, idx_test, _, _ = train_test_split(
    idx_array, labels, train_size=0.8, stratify=labels, random_state=42
)

train_df, test_df = data.iloc[idx_train, :], data.iloc[idx_test, :]
y_train, y_test = labels[idx_train], labels[idx_test]

In [8]:
# Grid-search both candidate pipelines on the training split with 5-fold CV,
# using every available core, and report each search's best CV score.
search_options = dict(n_jobs=-1, cv=5)

nb_gs = GridSearchCV(nb_pipeline, nb_params, **search_options).fit(train_df, y_train)
print('Best Naive Bayes Classifier has accuracy score: {0}'.format(nb_gs.best_score_))

rf_gs = GridSearchCV(rf_pipeline, rf_params, **search_options).fit(train_df, y_train)
print('Best Random Forest Classifier has accuracy score: {0}'.format(rf_gs.best_score_))


Best Naive Bayes Classifier has accuracy score: 0.8676136363636363
Best Random Forest Classifier has accuracy score: 0.8943181818181818

In [6]:
# Retrain the best Naive Bayes configuration on the full dataset and persist it.
# The estimator is cloned first: refitting nb_gs.best_estimator_ in place would
# overwrite the model held by the grid search with one that has seen the test
# rows, contaminating any later evaluation of nb_gs on test_df.
# NOTE(review): the Naive Bayes result is the one persisted here although the
# recorded Random Forest CV score (0.894) is higher than Naive Bayes (0.868);
# confirm this choice is intentional.
best_model = clone(nb_gs.best_estimator_)
best_model.fit(data, data.tweet_source.values)

joblib.dump(best_model, 'best_model.pkl', compress=1)


Out[6]:
['best_model.pkl']

In [ ]:


In [7]:
# Spot-check the Random Forest search on the training rows: print the predicted
# probability from column 1 of predict_proba, the tweet text and the true label.
# Requires rf_gs from the grid-search cell; the recorded NameError comes from
# this cell having been executed before that one.
N_PREVIEW = 20  # maximum number of rows to print

pred_probs = rf_gs.predict_proba(train_df)[:, 1]

# Slicing plus zip stops cleanly when fewer than N_PREVIEW rows exist, whereas
# a fixed range(20) would raise IndexError.
preview_rows = zip(
    pred_probs[:N_PREVIEW],
    train_df.text.values[:N_PREVIEW],
    y_train[:N_PREVIEW]
)
for prob, text, label in preview_rows:
    print(prob, text, label)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-5b26e7c02bcb> in <module>()
----> 1 pred_probs = rf_gs.predict_proba(train_df)[:, 1]
      2 for i in range(20):
      3     print(pred_probs[i], train_df.text.values[i], y_train[i])

NameError: name 'rf_gs' is not defined