In [2]:
import json

# data_path = 'C:/Users/mramire8/Documents/Datasets/twitter'
data_path = '../../data/twitter'

def get_tweets_file(path):
    f = open(path)

    users = []
    for line in f:
        # each line holds concatenated JSON arrays of tweets (one array per user) separated by "]][["
        data = line.split("]][[")
        last = len(data)

        for i, tweets in enumerate(data):
            if i == 0:
                t = json.loads(tweets[1:] + "]")
            elif i == (last - 1):
                t = json.loads("[" + tweets[:-1])
            else:
                t = json.loads("[" + tweets + "]")
            users.append(t)

    return users

good = get_tweets_file(data_path + "/good.json")
print "Real users %s" % (len(good))
     
bots = get_tweets_file(data_path + "/bots.json")
print "Bot users %s" % (len(bots))


Real users 883
Bot users 898

In [2]:
print "total: ", 883+898
print "\n".join(sorted(good[0][0].keys()))


total:  1781
contributors
coordinates
created_at
entities
favorite_count
favorited
geo
id
id_str
in_reply_to_screen_name
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
lang
place
possibly_sensitive
retweet_count
retweeted
source
text
truncated
user

In [3]:
print "\n".join(sorted(good[0][0]['user'].keys()))


contributors_enabled
created_at
default_profile
default_profile_image
description
entities
favourites_count
follow_request_sent
followers_count
following
friends_count
geo_enabled
id
id_str
is_translation_enabled
is_translator
lang
listed_count
location
name
notifications
profile_background_color
profile_background_image_url
profile_background_image_url_https
profile_background_tile
profile_banner_url
profile_image_url
profile_image_url_https
profile_link_color
profile_location
profile_sidebar_border_color
profile_sidebar_fill_color
profile_text_color
profile_use_background_image
protected
screen_name
statuses_count
time_zone
url
utc_offset
verified

Distribution of Dates from Tweets

To avoid biasing the classifier with trending topics, the data should belong to the same date range; we want to make sure the users' tweets come from the same period of time.

Otherwise, the classifier could learn the topics of the period instead of distinguishing bots from real users. We check the date distribution of both classes.


In [3]:
import datetime
from collections import Counter

def get_date(date_str):
    return datetime.datetime.strptime(date_str.strip('"'), "%a %b %d %H:%M:%S +0000 %Y")

# datetime.strptime((r.json()[x]["created_at"]).strip('"'), "%a %b %d %H:%M:%S +0000 %Y")
def count_dates(users):
    dates = Counter()
    min_dt = get_date(users[0][0]['created_at'])
    for user in users:
        d = get_date(user[0]['created_at'])
        min_dt = min(min_dt, d)
        dates.update([d])
    return dates, min_dt

good_counts, min_good = count_dates(good)
print "Most common: %s" % good_counts.most_common(3)
print "Latest: %s" % min_good
        
bots_counts, min_bots = count_dates(bots)
print "Most common: %s" % bots_counts.most_common(3)
print "Latest: %s" % min_bots


Most common: [(datetime.datetime(2014, 10, 29, 2, 27, 39), 2), (datetime.datetime(2014, 10, 29, 1, 30, 20), 2), (datetime.datetime(2014, 10, 28, 23, 0, 55), 2)]
Earliest: 2009-11-24 02:11:19
Most common: [(datetime.datetime(2014, 10, 29, 6, 46, 9), 2), (datetime.datetime(2014, 10, 30, 0, 17, 5), 2), (datetime.datetime(2014, 10, 20, 22, 24, 23), 1)]
Earliest: 2009-11-01 12:56:32

In [4]:
## Number of users in each class with old tweets
## (i.e., tweets dated before 2014)
print "Old good users %s" %  len([d for d in good_counts.keys() if d.year < 2014])
print "Old bot users %s" %  len([d for d in bots_counts.keys() if d.year < 2014])


Old good users 45
Old bot users 173

In [36]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
 

mpl.style.use('fivethirtyeight')

years    = mdates.YearLocator()   # every year
months   = mdates.MonthLocator()  # every month
yearsFmt = mdates.DateFormatter('%Y')
monthsFmt = mdates.DateFormatter('%Y-%m')
fig = plt.figure(figsize=(7,7))
ax = plt.axes()

gds =[(d,c) for d,c in good_counts.iteritems() if d.year > 2013]
bts =[(d,c) for d,c in bots_counts.iteritems() if d.year > 2013]


kg = [d.toordinal() for d,_ in gds]
kb = [d.toordinal() for d,_ in bts]
wg = [c for _,c in gds]
wb = [c for _,c in bts]
plt.hist([kg,kb], weights=[wg,wb], bins=20, stacked=False, alpha=.7, label=['real', 'bots'])

ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(monthsFmt)
# ax.xaxis.set_minor_locator(months)
plt.legend(loc='best', frameon=False, numpoints=1)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
fig.autofmt_xdate()
plt.savefig("date_dist.png")


Data Processing

For each user we have the tweets in their timeline. Let $s=\{t_i \mid t_i \text{ is a tweet}\}_{i=1}^{200}$ be the tweets of a user, and let $D=\{(s_j, y_j)\}_{j=1}^{n}$ be the dataset of $n$ users, where $y_j \in \{\text{human}, \text{bot}\}$.

Text Processing

For every user we process the text of the tweets as follows:

  • Collapse URLs
  • Collapse mentions
  • Lowercase all text
  • Remove users that have not tweeted this year (i.e., 2014)

Dataset Format

We create a dataset dictionary containing:

  • data: All tweet objects per user
  • target: Class label of each user. Labels are 0:humans 1:bots
  • user_id: screen name of the user
  • user_name: full name of the user
  • user_text: all text of tweets converted into one single text

In [6]:
## convert the tweet into a data format of text documents
# from sklearn.datasets.base import Bunch
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
def preprocess(string, lowercase, collapse_urls, collapse_mentions):
    if not string:
        return ""
    if lowercase:
        string = string.lower()
#     tokens = []
    if collapse_urls:
        string = re.sub('http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub('@\S+', 'THIS_IS_A_MENTION', string)
#     if prefix:
#         tokens = ['%s%s' % (prefix, t) for t in tokens]
    return string

def timeline_to_doc(user, lowercase, collapse_urls, collapse_mentions):
    tweets = []
    for tw in user:
        tweets.append(preprocess(tw['text'], lowercase, collapse_urls, collapse_mentions))
    return tweets

def user_to_doc(users, lowercase, collapse_urls, collapse_mentions):
    timeline = []
    user_names = []
    user_id = []
    
    for user in users:
        timeline.append(timeline_to_doc(user, lowercase, collapse_urls, collapse_mentions))
        user_names.append(user[0]['user']['name'])
        user_id.append(user[0]['user']['screen_name'])
    return user_id, user_names, timeline

def bunch_users(class1, class2, vct, lowercase, collapse_urls, collapse_mentions, labels=None):
    # labels only provides display names for the two classes (stored as target_names);
    # targets are always encoded as 0 for class1 (humans) and 1 for class2 (bots)
    if labels is None:
        labels = [0, 1]

    user_id, user_names, timeline = user_to_doc(class1, lowercase, collapse_urls, collapse_mentions)
    user_id2, user_names2, timeline2 = user_to_doc(class2, lowercase, collapse_urls, collapse_mentions)
    target = [0] * len(user_id)
    user_id.extend(user_id2)
    user_names.extend(user_names2)
    timeline.extend(timeline2)
    target.extend([1] * len(user_id2))
    user_text = [". ".join(t) for  t in timeline]
#     data = Bunch(data=timeline, target=target, user_id=user_id, user_name=user_names)
    data = {'data':timeline, 'target':np.array(target), 'user_id':user_id, 'user_name':user_names, 'user_text':user_text}
    data['bow'] = vct.fit_transform(data['user_text'])

    random_state = np.random.RandomState(5612)        

    indices = np.arange(data['bow'].shape[0])
    random_state.shuffle(indices)
    data['target'] = np.array(data['target'])[indices]
    data_lst = np.array(data['data'] , dtype=object)
    data_lst = data_lst[indices]
    data['data'] = data_lst.tolist()
    data['bow'] = data['bow'][indices]
    data['user_id'] = np.array(data['user_id'])[indices]
    data['user_name'] = np.array(data['user_name'])[indices]
    data['user_text'] = np.array(data['user_text'])[indices]
    data['target_names'] = labels
    return data

In [17]:
import numpy as np 
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                      token_pattern='\\b\\w+\\b') 

gds =[g for g in good if get_date(g[0]['created_at']).year > 2013]
bts =[b for b in bots if get_date(b[0]['created_at']).year > 2013]

data = bunch_users(gds,bts, vct, True, True, True, labels=['good', 'bots'])

print "Total data:", len(data['target'])


Total data: 1563

Learning Curve: Random Sampling Baseline

We test the learning curve of random sampling as a baseline. The curve is for a classifier trained on per-user documents (all of a user's tweets concatenated into one document).


In [18]:
import sys
import os

sys.path.append(os.path.abspath("/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages"))

from sklearn.learning_curve import learning_curve
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

# import brewer2mpl
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold, ShuffleSplit
import itertools

def get_tableau():
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),  
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),  
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),  
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),  
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]  

    # Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.  
    for i in range(len(tableau20)):  
        r, g, b = tableau20[i]  
        tableau20[i] = (r / 255., g / 255., b / 255.)      

    return tableau20

def learning_curve_tweet(data,clf, sizes=None, curve_label=None):

    col = get_tableau()
    colors_n = itertools.cycle(col)
    random_state = np.random.RandomState(56124)
    indices = np.arange(data['bow'].shape[0])
    random_state.shuffle(indices)
    data['target'] = np.array(data['target'])[indices]
    data_lst = np.array(data['data'] , dtype=object)
    data_lst = data_lst[indices]
    data['data'] = data_lst.tolist()
    data['bow'] = data['bow'][indices]
    try:
        data['user_id'] = np.array(data['user_id'])[indices]
        data['user_name'] = np.array(data['user_name'])[indices]
        data['user_text'] = np.array(data['user_text'])[indices]
    except Exception:
        pass

    kcv = KFold(len(data['target']), n_folds=5, random_state=random_state,shuffle=True)

    scoring_fn = 'accuracy'
#     print("Classifier name:", clf.__class__.__name__, "C=", clf.C)
#     print("CV data:", data['bow'])
    if sizes is None:
        sizes = range(20, 4* len(data['target'])/5, 100)
    train_sizes, train_scores, test_scores = learning_curve(
        clf, data['bow'], data['target'], train_sizes=sizes, cv=5, scoring=scoring_fn, n_jobs=2)
    
    current_color = colors_n.next()

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
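    # note: the value reported in the STDEV column below is the standard error of the mean over the 5 CV folds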
    test_scores_std = 1.0 * np.std(test_scores, axis=1) / np.sqrt(5.0)

    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color=current_color)
    plt.plot(train_sizes, test_scores_mean, 'o-', mfc='white', linewidth=2, mew=2, markersize=10, mec=current_color, color=current_color,
             # label="Cross-validation score")
             label="{}".format(curve_label))

    print ("-"*40)
    print ("\nCOST\tMEAN\tSTDEV")
    print ("\n".join(["{0}\t{1:.3f}\t{2:.4f}".format(c,m,s) for c,m,s in zip(train_sizes, test_scores_mean, test_scores_std)]))
    plt.legend(loc="best")
    # plt.savefig('lr-{0}.png'.format(vct.__class__.__name__), bbox_inches="tight", dpi=200, transparent=True)
    plt.savefig('lradapt-sent-sent.png', bbox_inches="tight", dpi=200, transparent=True)
    plt.show()

In [10]:
classifier = 'lr'
if classifier == "mnb":
    clf = MultinomialNB(alpha=1)
else:
    clf = linear_model.LogisticRegression(penalty='l1', C=10)

learning_curve_tweet(data,clf)


----------------------------------------

COST	MEAN	STDEV
20	0.653	0.0318
120	0.762	0.0109
220	0.785	0.0093
320	0.784	0.0072
420	0.798	0.0104
520	0.793	0.0137
620	0.802	0.0102
720	0.817	0.0101
820	0.805	0.0060
920	0.807	0.0099
1020	0.808	0.0074
1120	0.811	0.0081
1220	0.813	0.0070

Preprocessing Options

The following results show how the preprocessing of the text affects classification performance.

Results

Text Processing: We observe no significant differences among the options. However, the best options are:

  • lower=True url=True mention=True
  • lower=True url=False mention=True
  • lower=False url=True mention=True
  • lower=False url=False mention=True

This seems to suggest that collapsing mentions helps more than other processing options.
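
For reference, this is what the collapsing options do to a single toy tweet, using the preprocess function defined above (the example tweet is made up):

sample = "Check this out http://t.co/abc123 @someuser #cool"
print preprocess(sample, True, True, True)
# -> check this out THIS_IS_A_URL THIS_IS_A_MENTION #cool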


In [11]:
## try all combinations of data

def try_all(clf, vct, good, bots):
    # Trying all possible options 
    lowercase_opts = [True, False]
    # keep_punctuation_opts = [True, False]
    url_opts = [True, False]
    mention_opts = [True, False]

    argnames = ['lower', 'url', 'mention']
    option_iter = itertools.product( lowercase_opts,
                           url_opts,
                           mention_opts)
    results = []
    for options in option_iter:
        print '\t'.join('%s=%s' % (name, opt) for name, opt in zip(argnames, options))
        data = bunch_users(good, bots, vct, *options)
        cv_scores = cross_val_score(clf, data['bow'], data['target'], cv=5, n_jobs=1)
        print("5-f CV Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

In [12]:
clf = linear_model.LogisticRegression(penalty='l1', C=10)

vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.82 (+/- 0.04)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.82 (+/- 0.04)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.04)

In [13]:
## Try by removing stopwords 
clf = linear_model.LogisticRegression(penalty='l2', C=10)

vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.84 (+/- 0.04)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.84 (+/- 0.05)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.84 (+/- 0.04)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.84 (+/- 0.04)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.84 (+/- 0.05)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.84 (+/- 0.04)

In [14]:
## Try by removing stopwords 
clf = linear_model.LogisticRegression(penalty='l2', C=10)

vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b', stop_words='english') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.84 (+/- 0.06)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.84 (+/- 0.05)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.84 (+/- 0.06)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.05)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.84 (+/- 0.05)

In [15]:
clf = MultinomialNB(alpha=(.54,.46))

vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-15-7cc20c65d05b> in <module>()
      4                       token_pattern='\\b\\w+\\b') 
      5 
----> 6 try_all(clf, vct, gds, bts)

<ipython-input-11-04110b39c7ea> in try_all(clf, vct, good, bots)
     16         print '\t'.join('%s=%s' % (name, opt) for name, opt in zip(argnames, options))
     17         data = bunch_users(good, bots, vct, *options)
---> 18         cv_scores = cross_val_score(clf, data['bow'], data['target'], cv=5, n_jobs=1)
     19         print("5-f CV Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
     20 

C:\Python27\lib\site-packages\sklearn\cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, score_func, pre_dispatch)
   1149                                               train, test, verbose, None,
   1150                                               fit_params)
-> 1151                       for train, test in cv)
   1152     return np.array(scores)[:, 0]
   1153 

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
    651             self._iterating = True
    652             for function, args, kwargs in iterable:
--> 653                 self.dispatch(function, args, kwargs)
    654 
    655             if pre_dispatch == "all" or n_jobs == 1:

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch(self, func, args, kwargs)
    398         """
    399         if self._pool is None:
--> 400             job = ImmediateApply(func, args, kwargs)
    401             index = len(self._jobs)
    402             if not _verbosity_filter(index, self.verbose):

C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, func, args, kwargs)
    136         # Don't delay the application, to avoid keeping the input
    137         # arguments in memory
--> 138         self.results = func(*args, **kwargs)
    139 
    140     def get(self):

C:\Python27\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
   1237         estimator.fit(X_train, **fit_params)
   1238     else:
-> 1239         estimator.fit(X_train, y_train, **fit_params)
   1240     test_score = _score(estimator, X_test, y_test, scorer)
   1241     if return_train_score:

C:\Python27\lib\site-packages\sklearn\naive_bayes.pyc in fit(self, X, y, sample_weight)
    323                                        dtype=np.float64)
    324         self._count(X, Y)
--> 325         self._update_feature_log_prob()
    326         self._update_class_log_prior(class_prior=class_prior)
    327         return self

C:\Python27\lib\site-packages\sklearn\naive_bayes.pyc in _update_feature_log_prob(self)
    430     def _update_feature_log_prob(self):
    431         """Apply smoothing to raw counts and recompute log probabilities"""
--> 432         smoothed_fc = self.feature_count_ + self.alpha
    433         smoothed_cc = smoothed_fc.sum(axis=1)
    434 

ValueError: operands could not be broadcast together with shapes (2,26766) (2,) 
lower=True	url=True	mention=True
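
The ValueError above comes from passing a tuple as alpha: in MultinomialNB, alpha is the (scalar) smoothing parameter, so a length-2 tuple cannot be broadcast against the (n_classes, n_features) count matrix. If the intent was to encode the class proportions, those would go in class_prior instead; a possible fix (not re-run here) would be:

clf = MultinomialNB(alpha=1.0, class_prior=[.54, .46])
try_all(clf, vct, gds, bts)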

In [17]:
clf = linear_model.LogisticRegression(penalty='l2', C=1)

vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.82 (+/- 0.06)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.81 (+/- 0.05)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.83 (+/- 0.03)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.82 (+/- 0.06)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.83 (+/- 0.04)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.81 (+/- 0.05)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.83 (+/- 0.03)

In [18]:
def print_features(coef, names):
    """ Print sorted list of non-zero features/weights. """
    ### coef = clf.coef_[0]
    ### names = vec.get_feature_names()
    print "*" * 50
    print("Number of Features: %s" % len(names))
    print "\n".join('%s\t%.2f' % (names[j], coef[j]) for j in np.argsort(coef)[::-1] if coef[j] != 0)
    print "*" * 50

In [19]:
clf = linear_model.LogisticRegression(penalty='l1', C=10)
data = bunch_users(gds, bts, vct, True, True, True)
clf.fit(data['bow'], data['target'])
print_features(clf.coef_[0], vct.get_feature_names())


**************************************************
Number of Features: 26766
right	33.26
obama	26.69
check	26.14
ff	22.73
your	21.85
video	16.69
better	15.26
vegas	14.43
grimm	13.98
jasrac	13.91
worldseries	13.48
like	13.31
hp	12.51
fashion	12.48
wa	11.67
solar	11.56
george	11.21
us	10.87
com	10.64
shellshock	10.57
nfl	9.93
katy	9.83
amp	9.60
up	9.59
2012	9.18
facebook	8.83
rock	8.45
everton	8.36
answer	8.19
non	8.16
zu	8.08
berlin	7.89
nomad	7.88
ramirez	7.84
halloween	7.80
we	7.65
u	7.32
tips	7.18
reproducci	7.13
super	7.00
seo	6.97
gt	6.75
best	6.74
remain	6.72
gold	6.53
learn	6.33
money	6.31
cup	6.30
hollywood	6.27
india	6.25
wine	6.22
wow	5.86
after	5.85
vc	5.83
london	5.60
review	5.45
signal	5.40
show	5.36
please	5.20
business	5.12
the	5.03
hockey	4.92
carlos	4.84
affiliate	4.77
cocktail	4.72
you	4.67
free	4.67
fantasy	4.65
pack	4.62
melbourne	4.52
football	4.43
estate	4.41
anime	4.32
top	4.30
series	4.26
winning	4.21
art	4.18
funny	4.09
traveled	4.07
on	4.04
hi	4.02
this	3.97
contest	3.94
homeless	3.93
event	3.89
though	3.88
via	3.86
washington	3.66
watch	3.66
og	3.60
desconto	3.54
awesome	3.43
tip	3.37
beer	3.35
bla	3.34
iot	3.30
mobile	3.29
all	3.13
lol	3.10
great	3.09
congratulations	3.05
cont	3.05
000	2.93
support	2.91
los	2.85
what	2.85
zoom	2.79
webdesign	2.78
v	2.77
cpu	2.76
past	2.74
brooklyn	2.74
food	2.58
creature	2.57
save	2.57
alert	2.56
outsiders	2.55
tlot	2.42
para	2.32
new	2.30
man	2.15
daily	2.14
these	2.12
n	2.10
soon	1.91
incl	1.90
marketing	1.89
skiing	1.89
swift	1.85
camping	1.82
packers	1.80
metal	1.79
hip	1.70
poker	1.65
analyse	1.65
done	1.60
archives	1.57
la	1.55
ukraine	1.47
quest	1.46
nudes	1.44
marijuana	1.41
maine	1.40
divorce	1.36
fight	1.31
disney	1.22
domain	1.20
randomness	1.18
golf	1.14
android	1.11
organic	1.07
photography	1.05
cro	1.04
receita	1.02
traffic	1.00
bey	0.96
experience	0.95
nt	0.90
dog	0.87
every	0.81
enjoy	0.81
mozfest	0.80
ipad	0.76
our	0.74
siege	0.73
hey	0.65
nasdaq	0.65
bangkok	0.62
for	0.58
thanks	0.50
fdp	0.47
page	0.45
mayor	0.42
wales	0.32
more	0.31
want	0.27
swag	0.26
edition	0.24
tcot	0.24
clippers	0.23
hamburg	0.21
finale	0.13
dose	0.12
royalty	0.07
tattoo	0.05
podcast	0.04
social	0.03
regram	0.03
sales	0.01
shopify	-0.06
aws	-0.06
knicks	-0.09
fuck	-0.14
xbox	-0.15
reading	-0.16
kiwi	-0.22
php	-0.26
jobs	-0.27
en	-0.31
ct	-0.32
api	-0.34
lieu	-0.37
excited	-0.39
about	-0.39
ist	-0.40
adam	-0.44
austin	-0.46
lua	-0.50
my	-0.51
phoenix	-0.52
3	-0.59
mia	-0.59
31	-0.65
native	-0.67
jag	-0.73
morning	-0.74
concurso	-0.79
google	-0.83
az	-0.84
es	-0.90
2	-0.93
nonprofits	-0.96
5c	-0.97
radio	-0.99
brisbane	-1.01
porto	-1.05
louis	-1.09
israeli	-1.13
u2	-1.14
3d	-1.21
analytics	-1.28
m	-1.38
nashville	-1.39
css	-1.54
occupycentral	-1.56
allergies	-1.57
k	-1.58
says	-1.64
storytelling	-1.65
c	-1.65
interactive	-1.67
how	-1.73
evening	-1.80
een	-1.82
raptors	-1.90
is	-1.92
photo	-1.98
jeg	-2.08
journalism	-2.11
cube	-2.12
tutorial	-2.32
500px	-2.41
ameba	-2.42
doesn	-2.51
indyref	-2.54
checkout	-2.57
vscocam	-2.64
r	-2.68
a	-2.73
diary	-2.73
sorry	-2.74
rt	-2.84
plugs	-2.84
because	-2.85
this_is_a_url	-2.91
birds	-2.95
ho	-2.98
set	-3.01
doctor	-3.02
coordinate	-3.07
mac	-3.11
ebola	-3.15
clima	-3.17
b2b	-3.28
that	-3.33
chennai	-3.33
pic	-3.34
as	-3.36
hub	-3.42
e	-3.43
acim	-3.47
of	-3.54
i	-3.55
why	-3.56
het	-3.56
cape	-3.56
au	-3.59
portland	-3.60
earthquake	-3.72
w	-3.77
vita	-3.79
df14	-3.80
getglue	-3.91
401	-4.03
sharepoint	-4.03
tigers	-4.09
photoset	-4.15
utah	-4.32
tvtag	-4.39
this_is_a_mention	-4.52
shot	-4.52
at	-4.54
athens	-4.56
francisco	-4.61
von	-4.68
aloha	-4.72
post	-4.74
_	-5.12
rochester	-5.39
un	-5.61
so	-5.68
nowplaying	-5.69
loved	-5.70
tatort	-5.74
email	-5.75
guide	-5.78
nemuritsuzuketeshinu	-5.81
el	-5.85
d	-5.89
rpg	-5.93
law	-6.12
singapore	-6.29
cm	-6.33
t	-6.34
fir	-6.38
dear	-6.44
me	-6.51
ingress	-6.57
times	-6.57
ugh	-6.71
html5	-6.73
in	-6.77
2014	-6.91
if	-7.00
lastfm	-7.16
when	-7.17
david	-7.37
from	-7.50
train	-7.51
liked	-7.69
plath	-7.86
data	-8.19
ruby	-8.20
day	-8.27
iowa	-8.32
11	-8.49
default	-8.74
infusionsoft	-8.87
ah	-9.05
japan	-9.15
block	-9.42
maps	-9.46
they	-9.50
hn	-9.63
tumblr	-9.66
tape	-9.72
hk	-9.86
being	-9.87
niet	-10.02
g	-10.33
islamic	-10.38
10	-10.43
looks	-10.96
se	-11.04
den	-12.25
jokowi	-12.27
giveaway	-12.81
av	-12.93
o	-13.06
km	-13.22
has	-13.27
abundance	-13.63
not	-13.70
yummy	-13.77
nhk	-15.65
yosemite	-15.68
le	-16.38
1	-17.08
play	-17.33
cheese	-17.37
work	-17.41
debate	-17.99
product	-19.37
ve	-23.01
oh	-23.72
**************************************************

For the classifier of choice we look for the best parameters to maximize accuracy. For logistic regression we search over the C regularization penalty.

Results:

We found that C=10 seems to work well with the data.

Note: The data only includes users with tweets in the current year.


In [20]:
# Grid search best estimator
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV

def grid_search_clf(data, clf, tuned_parameters):
    import copy
    # scores = ['accuracy','precision', 'recall']
    measure = 'accuracy'
    X_train, X_test, y_train, y_test = train_test_split(data['bow'], data['target'], test_size=0.25, random_state=0)
    kcv = StratifiedKFold(y=y_train, n_folds=5, shuffle=True, random_state=546321)
    
    print("# Tuning hyper-parameters for %s" % measure)
    print(len(y_train))
    clf_new = copy.copy(clf)
    clfGS = GridSearchCV(clf_new, tuned_parameters, cv=5, scoring=measure, n_jobs=10, refit=True)
    clfGS.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clfGS.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clfGS.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clfGS.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

In [22]:
tuned_parameters = [{'C':  [pow(10,x) for x in range(-3,4)]}]
grid_search_clf(data, linear_model.LogisticRegression(penalty='l1', C=10), tuned_parameters)


# Tuning hyper-parameters for accuracy
1172
Best parameters set found on development set:
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l1', random_state=None, tol=0.0001)
()
Grid scores on development set:
()
0.539 (+/-0.000) for {'C': 0.001}
0.539 (+/-0.000) for {'C': 0.01}
0.752 (+/-0.010) for {'C': 0.1}
0.808 (+/-0.004) for {'C': 1}
0.842 (+/-0.006) for {'C': 10}
0.823 (+/-0.007) for {'C': 100}
0.825 (+/-0.006) for {'C': 1000}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
             precision    recall  f1-score   support

          0       0.81      0.83      0.82       206
          1       0.80      0.78      0.79       185

avg / total       0.81      0.81      0.81       391

()

In [23]:
tuned_parameters = [{'C':  [pow(10,x) for x in range(-3,4)]}]
grid_search_clf(data, linear_model.LogisticRegression(penalty='l2'), tuned_parameters)


# Tuning hyper-parameters for accuracy
1172
Best parameters set found on development set:
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
()
Grid scores on development set:
()
0.539 (+/-0.000) for {'C': 0.001}
0.698 (+/-0.011) for {'C': 0.01}
0.783 (+/-0.005) for {'C': 0.1}
0.828 (+/-0.005) for {'C': 1}
0.854 (+/-0.008) for {'C': 10}
0.861 (+/-0.006) for {'C': 100}
0.855 (+/-0.005) for {'C': 1000}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are computed on the full evaluation set.
()
             precision    recall  f1-score   support

          0       0.81      0.88      0.85       206
          1       0.86      0.77      0.81       185

avg / total       0.83      0.83      0.83       391

()

Data Representation

We test the effect of the data representation on the performance of the classifier.

Results

We tried two vectorizers, CountVectorizer and TfidfVectorizer, each combined with unigrams and bigrams. The classifier's performance does not change significantly across representations; TFIDF with unigrams and bigrams works slightly better.


In [24]:
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.03)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.83 (+/- 0.03)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.02)

In [25]:
vct = TfidfVectorizer(encoding='latin1', min_df=1, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.81 (+/- 0.03)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.81 (+/- 0.05)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.04)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.81 (+/- 0.03)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.81 (+/- 0.05)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.82 (+/- 0.03)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.82 (+/- 0.05)

In [26]:
vct = CountVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

try_all(clf, vct, gds, bts)


lower=True	url=True	mention=True
5-f CV Accuracy: 0.80 (+/- 0.03)
lower=True	url=True	mention=False
5-f CV Accuracy: 0.79 (+/- 0.05)
lower=True	url=False	mention=True
5-f CV Accuracy: 0.79 (+/- 0.06)
lower=True	url=False	mention=False
5-f CV Accuracy: 0.80 (+/- 0.05)
lower=False	url=True	mention=True
5-f CV Accuracy: 0.80 (+/- 0.05)
lower=False	url=True	mention=False
5-f CV Accuracy: 0.80 (+/- 0.04)
lower=False	url=False	mention=True
5-f CV Accuracy: 0.80 (+/- 0.06)
lower=False	url=False	mention=False
5-f CV Accuracy: 0.80 (+/- 0.05)

Testing Sentence Classifier

We test how well a sentence-level classifier (one trained on individual tweets) does at classifying bots versus humans.


In [19]:
##sentence detector in tweets
def sentence_detector(timeline):
    tl = []
    print len(timeline)
    for tw in timeline:
        tl.append(tw)
    return tl

def convert2sentence(data):
    all_sent=[]
    all_target=[]
    for user_timeline, label in zip(data['data'], data['target']):
        sentences = user_timeline
        lbls = [label] * len(sentences)
        all_sent.extend(sentences)
        all_target.extend(lbls)
    return all_sent, all_target

In [20]:
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b') 
x_sent,y_sent = convert2sentence(data)
random_state = np.random.RandomState(5612) 
indices = np.arange(len(y_sent))
random_state.shuffle(indices)
y_sent = np.array(y_sent)[indices]
data_lst = np.array(x_sent , dtype=object)
data_lst = data_lst[indices]
x_sent = data_lst.tolist()

In [29]:
clf_sent = linear_model.LogisticRegression(penalty='l1', C=10)
print "Total sentences:", len(y_sent)
learning_curve_tweet({'data':x_sent, 'target':y_sent, 'bow':vct.fit_transform(x_sent)},clf, sizes=range(1000, 20000, 1000))


Total sentences: 309320
----------------------------------------

COST	MEAN	STDEV
1000	0.650	0.0010
2000	0.664	0.0007
3000	0.673	0.0005
4000	0.674	0.0012
5000	0.681	0.0009
6000	0.685	0.0016
7000	0.691	0.0011
8000	0.697	0.0008
9000	0.699	0.0011
10000	0.702	0.0005
11000	0.705	0.0010
12000	0.708	0.0009
13000	0.710	0.0007
14000	0.711	0.0009
15000	0.712	0.0013
16000	0.714	0.0011
17000	0.716	0.0012
18000	0.719	0.0011
19000	0.721	0.0010

In [37]:
#other classifiers 
lr2=linear_model.LogisticRegression(penalty='l2', C=1)
# lr2.fit(data['bow'], data['target'])
learning_curve_tweet({'data':x_sent, 'target':y_sent, 'bow':vct.fit_transform(x_sent)}, 
                     lr2, sizes=range(1000, 20000, 1000), curve_label="LRL2-C=1")


----------------------------------------

COST	MEAN	STDEV
1000	0.655	0.0095
2000	0.682	0.0010
3000	0.696	0.0005
4000	0.698	0.0006
5000	0.704	0.0006
6000	0.708	0.0007
7000	0.712	0.0005
8000	0.716	0.0007
9000	0.719	0.0006
10000	0.723	0.0008
11000	0.724	0.0007
12000	0.726	0.0006
13000	0.728	0.0008
14000	0.729	0.0007
15000	0.730	0.0007
16000	0.732	0.0009
17000	0.733	0.0006
18000	0.735	0.0007
19000	0.736	0.0006

In [30]:
#other classifiers 
lr2=linear_model.LogisticRegression(penalty='l2', C=10)
# lr2.fit(data['bow'], data['target'])
learning_curve_tweet({'data':x_sent, 'target':y_sent, 'bow':vct.fit_transform(x_sent)}, 
                     lr2, sizes=range(1000, 20000, 1000), curve_label="LRL2")


----------------------------------------

COST	MEAN	STDEV
1000	0.665	0.0017
2000	0.680	0.0013
3000	0.691	0.0007
4000	0.695	0.0011
5000	0.701	0.0007
6000	0.705	0.0009
7000	0.711	0.0005
8000	0.715	0.0005
9000	0.718	0.0006
10000	0.721	0.0007
11000	0.723	0.0008
12000	0.726	0.0007
13000	0.729	0.0006
14000	0.730	0.0005
15000	0.732	0.0006
16000	0.733	0.0007
17000	0.735	0.0008
18000	0.737	0.0008
19000	0.738	0.0006

In [31]:
mnb = MultinomialNB(alpha=1)
learning_curve_tweet({'data':x_sent, 'target':y_sent, 'bow':vct.fit_transform(x_sent)}, 
                     mnb, sizes=range(1000, 20000, 1000), curve_label="MNB")


----------------------------------------

COST	MEAN	STDEV
1000	0.661	0.0188
2000	0.695	0.0015
3000	0.705	0.0008
4000	0.710	0.0001
5000	0.716	0.0004
6000	0.720	0.0005
7000	0.725	0.0005
8000	0.728	0.0009
9000	0.730	0.0007
10000	0.733	0.0007
11000	0.734	0.0006
12000	0.736	0.0005
13000	0.739	0.0004
14000	0.740	0.0005
15000	0.741	0.0004
16000	0.742	0.0005
17000	0.742	0.0008
18000	0.744	0.0010
19000	0.745	0.0007

Other Features

Term uniqueness? We try, as a single additional feature, the fraction of the vocabulary that a user's document actually uses:

\begin{align} f(x) = \frac{\lvert \{\text{distinct terms in } x\} \rvert}{\lvert V \rvert} \end{align}

where $V$ is the vectorizer vocabulary.
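
A minimal sketch of this feature, assuming vct has already been fit on the user documents (the cells below compute the same quantity via vct.inverse_transform):

def uniqueness(doc_bow, vocab_size):
    # fraction of the vocabulary with a non-zero entry in this document
    return 1. * doc_bow.nnz / vocab_size

# e.g.: uniqueness(data['bow'][0], len(vct.get_feature_names()))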

In [32]:
data.keys()


Out[32]:
['user_id', 'target', 'user_text', 'user_name', 'bow', 'target_names', 'data']

In [69]:
term_doc =vct.inverse_transform(data['bow']) 
print term_doc[:1]


[array([u'2014 2015', u'coordinating', u'bones this_is_a_url', u'and pass',
       u'cooking amp', u'appleton', u'body', u'12 12', u'coors', u'2017',
       u'arguing', u'art by', u'approaching', u'and heart', u'are afraid',
       u'charlatans', u'and inspire', u'bonne journ', u'ble',
       u'bloggers to', u'bbq this_is_a_url', u'3 this_is_a_url',
       u'another new', u'business like', u'creating the', u'da a',
       u'costumes', u'but think', u'coole', u'cookies this_is_a_url',
       u'amp to', u'belfast', u'books please', u'curioso que',
       u'air this_is_a_mention', u'day out', u'a creative', u'barbados',
       u'cost is', u'2nite', u'de gra', u'awesome new', u'big blue',
       u'bonds', u'analyst', u'as this_is_a_mention', u'de semana',
       u'barona buffet', u'cooker', u'cocktail', u'breakfast and',
       u'angers', u'an incident', u'corruption', u'bolsas',
       u'battle this_is_a_url', u'de aqu', u'bedacht', u'be authentic',
       u'de barrio', u'cool mrx', u'company logo', u'again in',
       u'a century', u'bankside in', u'de seu', u'datastorage', u'4 or',
       u'10 surprising', u'cyclists', u'business cards', u'cl ssico',
       u'com essa', u'a sale', u'at social', u'coisa',
       u'animation this_is_a_url', u'at today', u'de kans', u'and rt',
       u'canneslions', u'20 on', u'de enero', u'2013 and', u'269',
       u'cleaning up', u'canci n', u'and search', u'balance of',
       u'bit much', u'at ease', u'arianna', u'be holding', u'android and',
       u'291', u'cool check', u'6 en', u'clima', u'be remembered',
       u'be when', u'bit this_is_a_url', u'ano', u'censorship',
       u'5 minute', u'amp join', u'autistic', u'answer on', u'could tell',
       u'de pinkpop', u'and dad', u'best things', u'app developers',
       u'citation', u'bostonstrong', u'and chief', u'dave and', u'50k',
       u'conozco', u'actually that', u'de pl', u'brighton', u'be updated',
       u'couch', u'be featured', u'airport city', u'21 de', u'21 age',
       u'body art', u'2 mp3s', u'day free', u'considerate', u'can ask',
       u'1272', u'coordinator', u'are building', u'be opening',
       u'blog customer', u'david guetta', u'announcers', u'and canada',
       u'db', u'conversion magnets', u'bitter', u'barry got',
       u'de geschiedenis', u'bonnie', u'de ser', u'cooling',
       u'culture and', u'109', u'chris bosh', u'a raise', u'alexis 2yo',
       u'copywriting tip', u'airport ord', u'apple this_is_a_url',
       u'college', u'another 8', u'3x points', u'cx', u'50 00', u'3 13',
       u'acquiring', u'closing', u'day 27', u'3wrestleteers',
       u'and orange', u'68 km', u'anime this_is_a_url', u'crises',
       u'could save', u'00s', u'06 mi', u'0 0', u'day sunshine',
       u'compassion and', u'cool as', u'bows', u'copies of', u'de renda',
       u'albert schweitzer', u'4 for', u'avan', u'de podemos', u'champon',
       u'10 content', u'bronx', u'court', u'article i', u'and gtx',
       u'christal this_is_a_url', u'argues', u'best day', u'changing your',
       u'blackgirlsupremacy this_is_a_url', u'at 11', u'contest',
       u'came by', u'buffalo chicken', u'at techsailor', u'conference and',
       u'daisy', u'an inbound', u'against is', u'could turn', u'cheryl',
       u'crist o', u'a president', u'2nd half', u'asco', u'a hot',
       u'at viking', u'audience is', u'call upon', u'any longer',
       u'collision', u'days or', u'are 8', u'20 mp', u'commissioner for',
       u'6 unfollowers', u'binary', u'and always', u'de queijo',
       u'4 badge', u'and rain', u'1 hora', u'cheating husband', u'data as',
       u'can express', u'2012 2014', u'beim', u'bought the', u'amp has',
       u'a voc', u'boxes', u'between 2', u'8 tips',
       u'across this_is_a_url', u'airs', u'again because', u'by sue',
       u'cries', u'1 here', u'782 112', u'and more', u'da gua',
       u'de cosas', u'cloud this_is_a_url', u'0 5', u'baths', u'bad idea',
       u'day 11', u'archer', u'best vegetable', u'a disposable',
       u'currently', u'been asking', u'about having', u'ao vivo',
       u'and spent', u'as black', u'a launch', u'an average', u'0 6',
       u'back as', u'burl', u'cartier', u'buscan', u'cheshire', u'24 14',
       u'days away', u'all are', u'content to', u'be perfect',
       u'checked out', u'de im', u'20 20', u'bitcoins', u'and less',
       u'best not', u'de ellos', u'but have', u'day time', u'and coming',
       u'100 ways', u'b test', u'animations', u'10 the', u'00 14', u'140',
       u'a pensar', u'click here', u'5 unique', u'de', u'a donut',
       u'better is', u'bangerz', u'burkinafaso', u'50 of', u'amp mentos',
       u'battle with', u'day this_is_a_mention', u'a false',
       u'an interest', u'amp offices', u'a catchup', u'6 years',
       u'common type', u'black eyed', u'clarence', u'cura', u'141',
       u'criticism', u'architectural', u'colored', u'2014 cadillac',
       u'couple hours', u'bad day', u'about low', u'bb', u'also',
       u'and girls', u'comms', u'1 trillion', u'creating viral',
       u'cobertura', u'cold day', u'could actually', u'day thanks',
       u'be painful', u'0', u'000 in', u'all of', u'05 14', u'cooking',
       u'de 2012', u'comemorando', u'a known', u'de olho', u'23 at',
       u'd work', u'concepts this_is_a_url', u'bins', u'6plus',
       u'at least', u'apple watch', u'applewebkit 537', u'classes in',
       u'38', u'1947', u'breeders cup', u'announces a', u'answer your',
       u'ab peoria', u'a pie', u'android handheld', u'11am', u'circus',
       u'battery drainage', u'a less', u'a california', u'at war',
       u'and information', u'9 in', u'30 minutos', u'all lt', u'31st',
       u'are totally', u'cons this_is_a_url', u'cocoa', u'apa yang',
       u'de estar', u'antes', u'carry out', u'characters to',
       u'architecture in', u'clean eating', u'corner of', u'could write',
       u'20 15', u'been waiting', u'and begin', u'beast', u'chuck',
       u'10 places', u'am this_is_a_url', u'am this_is_a_mention',
       u'army of', u'a career', u'beide', u'16mm', u'album art',
       u'10 minute', u'at microsoft', u'de jogos', u'can no', u'am to',
       u'and yours', u'car bomb', u'alegria', u'als je', u'brann',
       u'000 year', u'cluster', u'but great', u'an item', u'bumpy',
       u'companies have', u'a major', u'but true', u'annual conference',
       u'como vc', u'created equal', u'beside', u'come work', u'child was',
       u'a senior', u'character this_is_a_url', u'buy some', u'commonly',
       u'cu l', u'a community', u'bread in', u'cambian', u'1 mes',
       u'considero', u'allts', u'18 this_is_a_mention', u'a lead',
       u'avventura is', u'alerted our', u'11 of', u'chste', u'bonuses',
       u'a jar', u'congress to', u'a coffee', u'day your', u'as best',
       u'con lo', u'article marketing', u'candidate', u'all people',
       u'at new', u'de juan', u'curriculum sale', u'appointment online',
       u'day because', u'7212 api', u'122 years', u'akku', u'an atheist',
       u'current state', u'dass es', u'a slightly', u'composed',
       u'a laptop', u'2 mentions', u'complete guide', u'cleared',
       u'brewer', u'craft project', u'cuma', u'24 hours', u'826',
       u'captain', u'chocolate milk', u'a rules', u'behind me',
       u'2014 local', u'a cigarette', u'coconut', u'200 royalty',
       u'clayton', u'bind', u'classic car', u'click the', u'91st',
       u'a group', u'amp was', u'boost for', u'alchemy', u'body to',
       u'comics new', u'booth this_is_a_url', u'a marketer', u'am ende',
       u'combat', u'appreciate this', u'break', u'coach and', u'blu',
       u'amp property', u'david thoreau', u'a powerful', u'blokes',
       u'and things', u'authentic', u'at benghazi', u'be paid',
       u'coffee shop', u'cken', u'call us', u'cuomo', u'das urnas',
       u'akibug', u'acim', u'converter', u'7pm', u'cash flow',
       u'aaron swartz', u'committee this_is_a_url', u'boy with',
       u'clients', u'111 this_is_a_url', u'2o', u'dawkins', u'calls a',
       u'10 en', u'das was', u'beach is', u'a premium', u'aftermath',
       u'carregar', u'arvo', u'colts', u'always tell', u'00pm', u'05',
       u'best for', u'bbb', u'a marriage', u'apis this_is_a_url',
       u'biballo', u'11 23', u'18 rt', u'actually in', u'closes in',
       u'cmos', u'best at', u'an isolated', u'3rd quarter', u'11 new',
       u'11', u'davide', u'am too', u'11 this_is_a_mention', u'circulaci',
       u'0 1084', u'all signal', u'data analysis', u'bedoel je',
       u'any case', u'a captain', u'black this_is_a_url', u'a review',
       u'bitcoin exchange', u'blog macrumors', u'call your', u'chipmunk',
       u'amp social', u'11 28', u'connecting', u'antwerp this_is_a_url',
       u'15 16', u'a spot', u'angst', u'as mentioned', u'clipe', u'awards',
       u'cat pet', u'alma', u'apple a', u'appreciate that', u'an insult',
       u'best this_is_a_url', u'challenge your', u'brslabs', u'bolster',
       u'church planter', u'calling your', u'auction', u'0 4',
       u'a masterpiece', u'casinos this_is_a_url', u'com um', u'add me',
       u'amp great', u'clutter', u'bater a', u'all this', u'blind to',
       u'bring me', u'contest win', u'algae', u'be moving', u'aflevering',
       u'al lado', u'apologize for', u'can possibly', u'be put',
       u'accelerate your', u'begin this_is_a_url', u'de alta', u'benmont',
       u'10 in', u'de no', u'constitutional amendment', u'characters on',
       u'accelerating', u'cookbook', u'convince', u'cable this_is_a_url',
       u'apple pay', u'blog with', u'are thrilled',
       u'breastcancerawareness month', u'de las', u'amp think', u'2001',
       u'brslabs aisight', u'24c', u'big world', u'brief', u'also got',
       u'actually is', u'7 ans', u'cantwait', u'a cutie', u'deal of',
       u'butting', u'bangkok thailand', u'beasts', u'benefits and',
       u'accordingly', u'de iphone', u'celebs', u'beware', u'any kind',
       u'and realized', u'4 am', u'beta program', u'125k', u'26 2014',
       u'as me', u'and mid', u'amp marc', u'a st', u'4 hours', u'assange',
       u'dashboard for', u'an invite', u'a faith', u'anyone you',
       u'canyon this_is_a_url', u'all be', u'accepted', u'000 for',
       u'boutique', u'be different', u'black white', u'additions',
       u'by playing', u'blues and', u'date with', u'beta testers',
       u'child molester', u'bezig', u'attach', u'a dime', u'cups of',
       u'brooklyn beta', u'and followers', u'commerce strategy',
       u'a votar', u'an interactive', u'c is', u'am currently',
       u'and live', u'created our', u'a bucket'], 
      dtype='<U48')]

In [34]:
print type(term_doc)
print len(term_doc)
print len(data['user_id'])
text = "i love going to the beach beach beach beach"
bottext = vct.transform([text])
print bottext
print bottext.nonzero()


<type 'list'>
1563
1563
  (0, 102110)	0.164022815843
  (0, 100980)	0.0786740738364
  (0, 93280)	0.260899401027
  (0, 93041)	0.0731253068234
  (0, 56926)	0.151220176225
  (0, 46445)	0.196699781766
  (0, 46209)	0.0912204215625
  (0, 40645)	0.186670642381
  (0, 40614)	0.16983634228
  (0, 14707)	0.871734635652
(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), array([102110, 100980,  93280,  93041,  56926,  46445,  46209,  40645,
        40614,  14707]))

In [62]:
def uniqueness_features(inverse_vector, vct):
    unique = len(vct.get_feature_names())
    x = []
    for d in inverse_vector:
        x.append([1. * len(d)/unique])
    return np.array(x)

UX = uniqueness_features(term_doc, vct)
print UX.shape
print data['target'].shape


(1563L, 1L)
(1563L,)

In [64]:
def cross_val(X, y, clf, cv=5): 
#     data = bunch_users(good, bots, vct, *options)
    cv_scores = cross_val_score(clf, X, y, cv=cv, n_jobs=1)
    print("5-f CV Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

uclf = linear_model.LogisticRegression(penalty='l2', C=1)
cross_val(UX, data['target'], uclf)
print UX[:10], data['target'][:10]


5-f CV Accuracy: 0.54 (+/- 0.00)
[[ 0.00594232]
 [ 0.00824257]
 [ 0.00069705]
 [ 0.00711858]
 [ 0.00710116]
 [ 0.00765008]
 [ 0.00794633]
 [ 0.00464407]
 [ 0.00148994]
 [ 0.00925329]] [1 0 0 0 0 1 0 0 1 0]

In [68]:
plt.scatter(UX, data['target'], c=data['target'])
plt.show()



In [21]:
y0 = []
y1 = []
for d,l in zip(data['data'],data['target']):
    if l == 0:
        y0.append(len(d))
    else:
        y1.append(len(d))
print "Real users:",len(y0)
print "Bot users:",len(y1)


Real users: 838
Bot users: 725

In [22]:
tw_y0 = Counter()
for c in y0:
    tw_y0[c] +=1
tw_y1 = Counter()
for c in y1:
    tw_y1[c] +=1

In [23]:
print "Most common:", tw_y0.most_common(3)
print "Least common", tw_y0.most_common()[:-3-1:-1]
print "Least tweets:", min(tw_y0.keys())
print np.mean(y0)


Most common: [(200, 682), (199, 80), (198, 38)]
Least common [(127, 1), (157, 1), (101, 1)]
Least tweets: 8
199.150357995

In [24]:
print "Most common:", tw_y1.most_common(3)
print "Least common", tw_y1.most_common()[:-3-1:-1]
print "Least tweets:", min(tw_y1.keys())
print np.mean(y1)


Most common: [(200, 615), (199, 48), (198, 13)]
Least common [(126, 1), (123, 1), (116, 1)]
Least tweets: 1
196.457931034

In [25]:
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score
vct_sent = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b') 

x_train, x_test,  y_train, y_test = train_test_split(x_sent, y_sent, test_size=0.5, random_state=42)
x_train_bow = vct_sent.fit_transform(x_train)
x_test_bow = vct_sent.transform(x_test)
lrl2=linear_model.LogisticRegression(penalty='l2', C=1)
lrl2.fit(x_train_bow,y_train)


Out[25]:
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [26]:
proba = lrl2.predict_proba(x_test_bow)
pred = lrl2.predict(x_test_bow)

print "Accuracy %f" % accuracy_score(y_test, pred)


Accuracy 0.770199

In [39]:
def print_top_terms(model, terms, n=20):
    print '\nTop Coefficients'
    coef = model.coef_[0]
    srted = np.argsort(coef)
    topi = srted[::-1][:n]
    boti = srted[:n]
    
    print 'Real Terms:\n' + '\n'.join('%s (%g)' % (n, c) for n, c in zip(terms[topi], coef[topi]))
    print '\nBot Terms:\n' + '\n'.join('%s (%g)' % (n, c) for n, c in zip(terms[boti], coef[boti]))
    print '\nintercept=%g' % model.intercept_

def print_terms_and_coef(row, terms, coef):
    indices = sorted(row.indices, key=lambda x: coef[x])
    print 'Top Terms:'
    for i in indices:
        if coef[i] != 0:
            print terms[i], "%.3f" % coef[i]
    print
    
def error_analysis(clf, predicted, predicted_proba, X, tweets, terms):
    
    print_top_terms(clf, np.array(terms))
    
    print '\nERRORS:'
    for i in range(predicted_proba.shape[0]):
        probability = predicted_proba[i][predicted[i]]
        # If we're very wrong.
        if predicted[i] != y_test[i] and probability > .97:
            
            print '\npred=%d (%g) truth=%d \ntext=%s ' % (predicted[i],
                                                            probability,
                                                            y_test[i],
                                                            tweets[i])
            print_terms_and_coef(X.getrow(i), terms, clf.coef_[0])
    
error_analysis(lrl2, pred, proba, x_test_bow, x_test, vct_sent.get_feature_names())


Top Coefficients
Real Terms:
escutando (6.24372)
stupiddope (5.47978)
seo (4.87362)
ff (4.84588)
metal (4.80181)
obama (4.47136)
kunstbende (4.22357)
snowboard (4.16969)
affiliate (4.09154)
burien (4.07725)
anastaciabestofyou (4.02571)
regram (4.00298)
daily (3.94676)
thw (3.93696)
bubblews (3.92649)
wondersofthemonsoon (3.87913)
wallpaper (3.87001)
web2 (3.84482)
divide (3.83077)
analyse (3.80664)

Bot Terms:
petherick (-5.02746)
bfb (-4.99993)
m_hash (-4.64134)
sinkan (-4.34529)
codermike (-4.15832)
utpol (-4.13839)
km (-4.03588)
hackernews (-3.74473)
spbuzz (-3.52067)
openco (-3.50943)
lastfm (-3.43062)
txfm (-3.40478)
tatort (-3.30923)
tas14 (-3.30751)
apartmenthackers (-3.20059)
nowplaying (-3.15324)
sharepoint (-3.09766)
gamasutra (-3.08537)
shenzhen (-2.944)
o (-2.81219)

intercept=-1.0703

ERRORS:

pred=0 (0.973712) truth=1 
text=THIS_IS_A_MENTION THIS_IS_A_MENTION  otro ,que a lo mejor no sabe que es la udef 
Top Terms:
que -2.025
es -1.541
this_is_a_mention -1.537
lo -0.665
sabe -0.627
otro -0.560
no -0.382
la -0.292
mejor -0.255
a -0.204


pred=1 (0.979979) truth=0 
text=massive roi for uk affiliate marketing in 2013 THIS_IS_A_URL via THIS_IS_A_MENTION 
Top Terms:
this_is_a_mention -1.537
in 0.112
uk 0.463
massive 0.532
this_is_a_url 0.738
roi 1.076
for 1.389
2013 1.611
via 3.345
marketing 3.655
affiliate 4.092


pred=1 (0.970906) truth=0 
text=the best new photography books you need right now THIS_IS_A_URL via THIS_IS_A_MENTION 
Top Terms:
this_is_a_mention -1.537
need -0.008
now 0.241
the 0.546
this_is_a_url 0.738
right 1.198
best 1.366
books 1.381
new 1.798
you 1.896
photography 3.183
via 3.345


pred=0 (0.97089) truth=1 
text=THIS_IS_A_MENTION scherp! dat dachten wij dus ook! ;-) 
Top Terms:
ook -1.934
this_is_a_mention -1.537
dat -1.145
dus -1.111
wij -0.571


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION ξεχασαν να πουν του λοβερδου να παει εκκλησια και διεταξε εδε .πως κανει ετσι;
εμεις εχουμε να παμε απ´ τα βαφτισια μας. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION 👻 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.971393) truth=1 
text=photoset: iraffiruse: frozach submitted THIS_IS_A_URL 
Top Terms:
photoset -2.726
submitted -1.102
this_is_a_url 0.738


pred=0 (0.979218) truth=1 
text=rt THIS_IS_A_MENTION hoje estreia #rioeuteamo, filme que vai fazer você se apaixonar. veja o trailer: THIS_IS_A_URL 
Top Terms:
o -2.812
que -2.025
rt -1.973
se -1.613
this_is_a_mention -1.537
vai -1.332
voc -1.329
hoje -1.180
veja -0.782
fazer 0.092
filme 0.101
this_is_a_url 0.738
trailer 0.979


pred=0 (0.97592) truth=1 
text=rt THIS_IS_A_MENTION não perca a hora! a largada para o gp da inglaterra de 2014 será neste domingo, 9h (horário de brasília). #f1 
Top Terms:
o -2.812
rt -1.973
rio -1.551
this_is_a_mention -1.537
de -1.259
da -1.165
gp -0.902
ser -0.552
hor -0.458
lia -0.383
bras -0.376
a -0.204
n -0.196
domingo -0.147
hora -0.146
neste -0.067
2014 -0.039
para 0.246
f1 0.366


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION ▒▀██▀█▐█▀█ ▒▒██▀▀▀▀▀██▀██▀▌██── ██ ██▀▌▒ 
▒▒██─█▐█▀▀ ▒▒▀▀▀██▒▒██▒██▀▒▒██ .██▒██▀▒▒ 
▒▄██▄█▐█─█ ▒▒▐▀▀▀▀▒▒██▒██▄▌▒▒██ ▒▒… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=1 (0.977198) truth=0 
text=THIS_IS_A_MENTION “effective sales and marketing is getting to the truth as quickly as possible” #sales #marketing #cmworld #contentmarketing 
Top Terms:
this_is_a_mention -1.537
as -0.127
is 0.355
possible 0.491
the 0.546
and 0.555
to 0.660
quickly 0.718
getting 0.831
contentmarketing 0.877
cmworld 0.955
effective 1.790
truth 1.963
sales 2.981
marketing 3.655


pred=1 (0.97825) truth=0 
text=[tickets giveaway] golf lovers wanted! get your free tickets by logging in our website and answer the question.... THIS_IS_A_URL 
Top Terms:
in 0.112
giveaway 0.277
lovers 0.475
wanted 0.512
the 0.546
and 0.555
this_is_a_url 0.738
by 0.742
question 0.926
answer 0.978
get 0.986
our 1.534
tickets 1.673
website 1.902
free 2.659
your 3.196
golf 3.609


pred=1 (0.983518) truth=0 
text=top 5 tips of what to do with your tax refund: THIS_IS_A_URL 
Top Terms:
5 0.019
of 0.416
to 0.660
do 0.676
this_is_a_url 0.738
with 1.143
tax 1.223
what 1.235
refund 1.747
top 2.811
your 3.196
tips 3.762


pred=1 (0.97819) truth=0 
text=キャンセル待ちしてたら繰り上がり当選していたのでseoの勉強に来てみた!  #cssnite_lp36 
Top Terms:
seo 4.874


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION сложно ждать соблюдение дедлайнов от людей, которые даже в писуар попасть не могут. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION #goroyals 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION 😘👋 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.974109) truth=1 
text=THIS_IS_A_MENTION hhhhhhhhhhhhhh ah 
Top Terms:
ah -2.260
this_is_a_mention -1.537


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION 858353158 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.974435) truth=1 
text=THIS_IS_A_MENTION economía del cariño, seamos generosos con el consumidor 
Top Terms:
o -2.812
this_is_a_mention -1.537
el -1.195
con -0.733
del -0.462
a -0.204


pred=0 (0.976097) truth=1 
text=rt THIS_IS_A_MENTION el dividendo de hoy de endesa cuando ayer se anunciaba nuevo déficit de tarifa de 1.275 m€ es tan obsceno que hasta los moderad… 
Top Terms:
que -2.025
rt -1.973
se -1.613
es -1.541
d -1.538
this_is_a_mention -1.537
de -1.259
el -1.195
tan -1.083
1 -0.839
cuando -0.815
hoy -0.734
ayer -0.597
m -0.373
hasta -0.045
tarifa 0.138
los 0.227
275 0.447
nuevo 0.749


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION το μεγαλείο της ελληνικής γλώσσας διαφαίνεται στις σύνθετες λέξεις, οι οποίες προσδίδουν άλλη διάσταση σε μια απλή λέξη. … 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.988577) truth=1 
text=#nowplaying #blackops2 THIS_IS_A_MENTION THIS_IS_A_MENTION THIS_IS_A_MENTION 
Top Terms:
nowplaying -3.153
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION このツイートが千リツイートされたら、ココナラで販売しよう♪ 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=1 (0.97759) truth=0 
text=ffシリーズの面白さはグラフィックの綺麗さで、ゲーム性やストーリーじゃないからね。 
Top Terms:
ff 4.846


pred=1 (0.973571) truth=0 
text=3 ways to build an instant social media following for your company: THIS_IS_A_URL my latest post for THIS_IS_A_MENTION 
Top Terms:
this_is_a_mention -1.537
my -0.502
3 -0.285
an -0.005
company 0.139
to 0.660
this_is_a_url 0.738
media 0.934
build 1.184
for 1.389
post 1.406
latest 1.727
following 1.755
instant 1.813
ways 2.048
social 2.613
your 3.196


pred=0 (0.97033) truth=1 
text=voor het eerst dat ik een film van de coen brothers vind tegenvallen... misschien omdat ik een vliegtuig zat en... THIS_IS_A_URL 
Top Terms:
het -1.396
ik -1.373
een -1.326
de -1.259
voor -1.233
dat -1.145
en -0.765
van -0.529
eerst -0.469
misschien -0.438
vind -0.358
omdat -0.326
vliegtuig -0.224
brothers -0.078
zat -0.010
film 0.657
this_is_a_url 0.738


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION львы.занимайтесь тем,что у вас лучше всего получается - ебите другим мозг. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION crickets. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION τα καλύτερα στη ζωή περνάνε λέγοντας «είναι πολύ νωρίς», και μετά «είναι πολύ αργά»... 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION спонсор моей груди - гейб ньюэлл. уже и не надеюсь на третий. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION δεν μπορώ να αποφασίσω αν αλλάζει πιο πολλά άβαταρ η σοκο σοκο ή ονόματα ο παπαβανεσας. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION κανένας φάκελος δεν εχει εξαφανιςθεί. η διαδικασία γίνεται κανονικά στην βουλή. κάποιοι νομίζουν θα γλυτώσουν ρίχ… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.986995) truth=1 
text=rt THIS_IS_A_MENTION o poeta é um fingidor.finge tão completamente,que chega a fingir que é dor a dor que deveras sente.(fernando pessoa) htt… 
Top Terms:
o -2.812
que -2.025
rt -1.973
um -1.755
this_is_a_mention -1.537
t -1.091
chega -0.497
pessoa -0.481
fernando -0.323
a -0.204
sente -0.201
htt -0.010


pred=1 (0.972347) truth=0 
text=my latest post: what to know about seo services: search engine optimization (seo) is a technique used by diffe... THIS_IS_A_URL 
Top Terms:
technique -0.685
my -0.502
used -0.491
a -0.204
about 0.210
is 0.355
know 0.383
to 0.660
this_is_a_url 0.738
by 0.742
services 0.931
search 1.084
what 1.235
optimization 1.383
post 1.406
latest 1.727
engine 2.183
seo 4.874


pred=0 (0.987015) truth=1 
text=rt THIS_IS_A_MENTION s/o ta THIS_IS_A_MENTION 
Top Terms:
o -2.812
rt -1.973
this_is_a_mention -1.537
ta -1.488
s 0.178


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION εβγαλαν τις ελληνικες σημαίες στα μπαλκόνια. εβγαλα μια σημαια της αλφα μπανκ. #swste_tis_trapezes 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION θα γυρίσουν τα ρολόγια και θα κοιμηθούμε μια ώρα παραπάνω μας είπαν. και θα ξυπνήσουμε πιο ξεκούραστοι και πιο κεφάτοι μας … 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION holla💰🙋 THIS_IS_A_MENTION 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.974109) truth=1 
text=THIS_IS_A_MENTION ah... 
Top Terms:
ah -2.260
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION οι άνθρωποι με χιούμορ είναι οι πιο σοβαροί άνθρωποι που έχω γνωρίσει 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.982248) truth=1 
text=THIS_IS_A_MENTION muy de acuerdo. todos tenemos que mejorar en potenciar que falten el respeto a nuestras ideas. y buenos días! 
Top Terms:
que -2.025
muy -1.564
d -1.538
this_is_a_mention -1.537
de -1.259
buenos -1.221
el -1.195
en -0.765
y -0.600
tenemos -0.461
todos -0.382
acuerdo -0.310
a -0.204
as -0.127
ideas 0.833


pred=1 (0.971494) truth=0 
text=top 5 keys to success on youtube - new blog post! 
THIS_IS_A_URL 
Top Terms:
5 0.019
keys 0.527
to 0.660
this_is_a_url 0.738
on 0.782
blog 1.322
post 1.406
youtube 1.476
new 1.798
top 2.811
success 3.566


pred=0 (0.985789) truth=1 
text=σφαίρα "THIS_IS_A_MENTION THIS_IS_A_MENTION μάλλον δεν μπόρεσε!κλαίω!σε λίγο θα κάνω κροκέτες μπακαλιάρου.έρχεσαι;;o)))" 
Top Terms:
o -2.812
this_is_a_mention -1.537


pred=0 (0.985144) truth=1 
text=15. pois o altíssimo, o santo deus, o deus que vive para sempre, diz: "eu moro num lugar alto e sagrado, mas moro... THIS_IS_A_URL 
Top Terms:
o -2.812
que -2.025
mas -1.879
num -1.486
eu -1.347
sempre -1.142
e -1.090
lugar -0.702
pois -0.646
santo -0.557
diz -0.355
deus -0.265
vive -0.254
alt -0.141
15 -0.112
alto -0.101
ssimo 0.143
para 0.246
this_is_a_url 0.738


pred=1 (0.985617) truth=0 
text=8 tips for planning your small business' 2014 marketing strategy THIS_IS_A_URL 
Top Terms:
8 -0.136
2014 -0.039
planning 0.565
small 0.610
this_is_a_url 0.738
for 1.389
strategy 1.684
your 3.196
business 3.338
marketing 3.655
tips 3.762


pred=0 (0.975163) truth=1 
text=rt THIS_IS_A_MENTION “THIS_IS_A_MENTION o a pie rt THIS_IS_A_MENTION vayan a votar en cleta. el tráfico es un desastre.” fui a pie. debí llevar gorra: so… 
Top Terms:
o -2.812
rt -1.973
es -1.541
this_is_a_mention -1.537
un -1.327
pie -1.316
el -1.195
tr -0.834
en -0.765
fui -0.701
so -0.545
llevar -0.295
a -0.204
fico -0.006
deb 0.651
votar 0.809


pred=0 (0.970397) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION #nowplaying new THIS_IS_A_MENTION THIS_IS_A_MENTION #sundaynightdrama #streets 
Top Terms:
nowplaying -3.153
rt -1.973
this_is_a_mention -1.537
streets -0.572
new 1.798


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION μοναξιά δεν είναι νάσαι μόνος σου στο σπίτι. μοναξιά είναι να μην έχεις επιλογή 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.970428) truth=1 
text=rt THIS_IS_A_MENTION d̶r̶o̶p̶b̶o̶x̶
̶s̶n̶a̶p̶c̶h̶a̶t̶
̶g̶o̶o̶g̶l̶e̶
̶e̶v̶e̶r̶y̶ ̶a̶p̶p̶ ̶l̶a̶u̶n̶c̶h̶e̶r̶ ̶e̶v̶e̶r̶ 
Top Terms:
o -2.812
rt -1.973
d -1.538
this_is_a_mention -1.537
t -1.091
e -1.090
p -0.905
c -0.724
l -0.693
y -0.600
a -0.204
n -0.196
r -0.183
g -0.168
v -0.083
h 0.066
b 0.074
s 0.178
x 0.476
u 1.570


pred=0 (0.973491) truth=1 
text=photoset:  THIS_IS_A_URL 
Top Terms:
photoset -2.726
this_is_a_url 0.738


pred=0 (0.970193) truth=1 
text=+1 rt THIS_IS_A_MENTION lo mejor del #tbmbcn van a ser los blogueros.conocerles es la única sorpresa que realmente estoy buscando en el evento. 
Top Terms:
que -2.025
rt -1.973
es -1.541
this_is_a_mention -1.537
el -1.195
1 -0.839
nica -0.825
en -0.765
lo -0.665
realmente -0.601
ser -0.552
van -0.529
del -0.462
evento -0.410
la -0.292
mejor -0.255
a -0.204
estoy -0.190
buscando -0.067
sorpresa 0.080
los 0.227


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION おはようございます。「ご予約のトレメンスさま~!」と…誘導されました… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION malammm {} 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=1 (0.970383) truth=0 
text=check out this video for empower network: THIS_IS_A_URL 
Top Terms:
this 0.538
this_is_a_url 0.738
out 1.108
for 1.389
empower 1.557
network 1.790
video 2.933
check 2.979


pred=1 (0.98277) truth=0 
text=top 50 seo tips &amp; tricks for e-commerce websites THIS_IS_A_URL THIS_IS_A_URL 
Top Terms:
e -1.090
50 0.334
tricks 0.695
this_is_a_url 0.738
amp 0.962
websites 1.200
for 1.389
commerce 1.508
top 2.811
tips 3.762
seo 4.874


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION фраза дня: летающему слону можешь не отвечать 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.976154) truth=1 
text=torta bis! é tão gostosa que não tem como comer só um pedaço :v THIS_IS_A_URL 
Top Terms:
o -2.812
que -2.025
um -1.755
como -1.671
t -1.091
tem -0.708
comer -0.360
n -0.196
v -0.083
s 0.178
torta 0.524
bis 0.674
this_is_a_url 0.738


pred=0 (0.972868) truth=1 
text=rt THIS_IS_A_MENTION el de abajo acaba de twittear una pendejada, como siempre. 
Top Terms:
rt -1.973
como -1.671
this_is_a_mention -1.537
una -1.266
de -1.259
el -1.195
siempre -0.702
acaba -0.529
abajo 0.119


pred=0 (0.97104) truth=1 
text=THIS_IS_A_MENTION me ha parecido muy interesante, como siempre!!! ^_^ 
Top Terms:
como -1.671
_ -1.647
muy -1.564
this_is_a_mention -1.537
interesante -0.893
me -0.858
siempre -0.702
parecido -0.233
ha 0.605


pred=1 (0.97819) truth=0 
text=seo対策で、トップページ以外のサイトタイトルもいじりたいと言われると、カスタムフィールド増やさないとならんね。通常はブログ名=施設・店舗名という前提で構築するので… 
Top Terms:
seo 4.874


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION καις που καις φέρετρο έξω από τον σκαϊ, δεν πετάς και τον πορτοσάλτε μέσα; 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.973491) truth=1 
text=photoset:  THIS_IS_A_URL 
Top Terms:
photoset -2.726
this_is_a_url 0.738


pred=0 (0.971386) truth=1 
text=¿qué pasa si grabas con un gps durante un año todos los trayectos por la ciudad en la que vives? THIS_IS_A_URL  vía THIS_IS_A_MENTION 
Top Terms:
o -2.812
que -2.025
this_is_a_mention -1.537
un -1.327
qu -1.214
en -0.765
pasa -0.742
con -0.733
si -0.615
todos -0.382
ciudad -0.332
por -0.321
la -0.292
a -0.204
durante -0.197
v -0.083
los 0.227
gps 0.380
this_is_a_url 0.738


pred=1 (0.974734) truth=0 
text=we're building homes for millions of homeless families online and we need your help. join our team THIS_IS_A_URL 
Top Terms:
re -0.346
need -0.008
team 0.145
of 0.416
families 0.536
and 0.555
this_is_a_url 0.738
millions 1.126
homes 1.194
we 1.284
help 1.377
for 1.389
our 1.534
building 1.616
join 1.772
homeless 2.034
online 2.335
your 3.196


pred=1 (0.982698) truth=0 
text=tips to help you manage your business finances better | gabriel terrell's empower network blog THIS_IS_A_URL 
Top Terms:
gabriel -0.235
s 0.178
to 0.660
this_is_a_url 0.738
manage 0.931
finances 1.008
better 1.070
blog 1.322
help 1.377
empower 1.557
network 1.790
you 1.896
your 3.196
business 3.338
tips 3.762


pred=1 (0.979819) truth=0 
text=10 ways to improve your content marketing THIS_IS_A_URL 
Top Terms:
10 -0.516
to 0.660
this_is_a_url 0.738
improve 1.685
ways 2.048
content 2.372
your 3.196
marketing 3.655


pred=0 (0.972298) truth=1 
text=そして水中深く沈んでゆくのだった rt: THIS_IS_A_MENTION rt THIS_IS_A_MENTION 「苦しいときは上り坂」 壁に貼ろう!! 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION я часто видел, как служители и бизнесмены молятся и молятся, чтобы бог послал им хорошие возможности. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=1 (0.986832) truth=0 
text=how to use email marketing to generate more sales for your business THIS_IS_A_URL 
Top Terms:
use 0.173
email 0.470
generate 0.624
to 0.660
this_is_a_url 0.738
more 1.251
for 1.389
how 1.418
sales 2.981
your 3.196
business 3.338
marketing 3.655


pred=0 (0.970733) truth=1 
text=señores #tontos en todas partes. pregunten antes de quejarse/acusar que les han hackeado!!! 
Top Terms:
que -2.025
se -1.613
les -1.599
de -1.259
han -0.993
en -0.765
todas -0.567
partes -0.360
ores -0.175
antes -0.100


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION “THIS_IS_A_MENTION успешные люди чаще терпят неудачи! терпите неудачи, учитесь, восстанавливайтесь, ... #retryretryretry … 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=1 (0.986412) truth=0 
text=great social media tips with personality. check out 'social media for business' from THIS_IS_A_MENTION THIS_IS_A_MENTION THIS_IS_A_URL 
Top Terms:
this_is_a_mention -1.537
from 0.542
personality 0.581
this_is_a_url 0.738
media 0.934
out 1.108
with 1.143
for 1.389
great 2.055
social 2.613
check 2.979
business 3.338
tips 3.762


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION είναι γνωστό ότι τον τελευταίο καιρό δεν έχω κέφια αλλά κ από εσάς δεν βλέπω μεγάλες προθυμίες.. τι σας συμβαίνει; πείτε μο… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.971456) truth=1 
text=vía THIS_IS_A_MENTION es con el hierro, no con el oro, con lo que se libera la patria THIS_IS_A_URL 
Top Terms:
que -2.025
se -1.613
es -1.541
this_is_a_mention -1.537
el -1.195
con -0.733
lo -0.665
no -0.382
la -0.292
a -0.204
v -0.083
this_is_a_url 0.738


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION εσεις που κορναρετε πριν περάσετε ένα στενο, να φανταστω οταν γαματε φωναζετε"μπαινωωωω". 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION έχει πάθει κάτι το τιτιβείο; δεν μπορώ να κάνω φαβ, η λίστα είναι άδεια.. μαζέψατε υπογραφές να με διώξετε αλήτες; 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.981456) truth=1 
text=acabo de poner al día mi readitlater... necesito el cuarto de goku en el que una hora es un año por favor... 
Top Terms:
o -2.812
que -2.025
es -1.541
d -1.538
un -1.327
una -1.266
de -1.259
el -1.195
en -0.765
necesito -0.634
al -0.415
favor -0.383
por -0.321
a -0.204
hora -0.146
poner -0.127
acabo -0.094
mi 0.370
cuarto 0.373


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION αν είχαν χιούμορ οι διμοιρίτες στην παρέλαση,θα σήκωναν το χέρι ψηλά κ κατεβάζοντας το μπροστά στους επίσημους θα έδειχναν στα… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION 321192379 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.983158) truth=1 
text=rt THIS_IS_A_MENTION el sabado necesitamos que nos traigas al THIS_IS_A_MENTION pantalones para niños de 2 a 12 años.
pon tu granito de arena ht… 
Top Terms:
que -2.025
rt -1.973
os -1.577
this_is_a_mention -1.537
de -1.259
ht -1.233
ni -1.214
el -1.195
nos -0.882
2 -0.804
al -0.415
tu -0.400
a -0.204
12 -0.199
pon -0.157
necesitamos -0.134
arena 0.088
para 0.246


pred=1 (0.978741) truth=0 
text=rt THIS_IS_A_MENTION work in sales? our social media academy will teach you how to generate free leads using linkedin &amp; twitter THIS_IS_A_URL 
Top Terms:
rt -1.973
this_is_a_mention -1.537
work -0.277
twitter -0.267
in 0.112
will 0.541
generate 0.624
to 0.660
this_is_a_url 0.738
using 0.922
academy 0.925
media 0.934
amp 0.962
teach 1.202
how 1.418
leads 1.454
our 1.534
you 1.896
linkedin 2.377
social 2.613
free 2.659
sales 2.981


pred=0 (0.970946) truth=1 
text=rt THIS_IS_A_MENTION si cuando desperté me dolía todo, ahora me duele mucho mas x___x 
Top Terms:
rt -1.973
mas -1.879
this_is_a_mention -1.537
todo -1.349
me -0.858
cuando -0.815
si -0.615
mucho -0.469
a -0.204
ahora 0.400


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION #persibday :) 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.97464) truth=1 
text=aprenda como fazer um kibe de forno igual o da vovo. vem ver que fácil!  THIS_IS_A_URL 
Top Terms:
o -2.812
que -2.025
um -1.755
como -1.671
ver -1.556
de -1.259
da -1.165
igual -0.554
aprenda -0.222
f -0.023
fazer 0.092
vem 0.283
cil 0.427
this_is_a_url 0.738


pred=0 (0.97103) truth=1 
text=:o THIS_IS_A_URL 
Top Terms:
o -2.812
this_is_a_url 0.738


pred=0 (0.970759) truth=1 
text=THIS_IS_A_MENTION que padel? 
Top Terms:
que -2.025
this_is_a_mention -1.537


pred=1 (0.976135) truth=0 
text=why budgeting is very important in affiliate marketing - imgrind THIS_IS_A_URL 
Top Terms:
in 0.112
very 0.173
is 0.355
this_is_a_url 0.738
why 0.837
important 1.752
marketing 3.655
affiliate 4.092


pred=0 (0.971605) truth=1 
text=e esse cupcake maravilhoso? não dá pra resistir! THIS_IS_A_URL 
Top Terms:
o -2.812
d -1.538
e -1.090
pra -0.960
esse -0.824
cupcake -0.374
n -0.196
this_is_a_url 0.738


pred=0 (0.981853) truth=1 
text=sharepoint: THIS_IS_A_URL 
Top Terms:
sharepoint -3.098
this_is_a_url 0.738


pred=0 (0.970651) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION THIS_IS_A_MENTION THIS_IS_A_MENTION háganme el favor, ¿un congresista rogándole a un inquilino que salga de su casa? 
Top Terms:
que -2.025
rt -1.973
this_is_a_mention -1.537
un -1.327
de -1.259
el -1.195
favor -0.383
ndole -0.231
a -0.204
casa -0.098
su -0.061
h 0.066


pred=0 (0.981815) truth=1 
text=casi me bajo en la estación de füzesabony, que está a 19 km de eger, eso por dormirme y despertar en una estación grande... 
Top Terms:
km -4.036
que -2.025
una -1.266
de -1.259
eso -1.195
me -0.858
en -0.765
grande -0.701
y -0.600
estaci -0.476
casi -0.328
por -0.321
la -0.292
19 -0.225
a -0.204
n -0.196
est -0.059
f -0.023
bajo 0.042


pred=0 (0.980673) truth=1 
text=THIS_IS_A_MENTION baaaaah que lo llevabas encedido en el bus ^_^ 
Top Terms:
que -2.025
_ -1.647
this_is_a_mention -1.537
el -1.195
en -0.765
lo -0.665
bus -0.647


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION - чем занималась сегодня?
- у меня есть твиттер и инстаграм посмотри туда пиздец ты деревня. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION πάντως με μισό μπουκάλι μεταξά λες ναι σε πράγματα που έπρεπε να βροντοφωναξεις όχι 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.98835) truth=1 
text=THIS_IS_A_MENTION doblarse se dobla, siempre que te sientes encima.... no es el único al que le pasa ;) 
Top Terms:
que -2.025
le -1.794
se -1.613
es -1.541
this_is_a_mention -1.537
el -1.195
te -0.912
pasa -0.742
siempre -0.702
al -0.415
no -0.382
encima -0.360
nico -0.280


pred=1 (0.979415) truth=0 
text=my latest post: seo london – tel: 02033183249: THIS_IS_A_URL seo search engine optimisati... THIS_IS_A_URL 
Top Terms:
my -0.502
tel -0.294
this_is_a_url 0.738
search 1.084
london 1.233
post 1.406
latest 1.727
engine 2.183
seo 4.874


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION うお!そういえば昨日八王子の居酒屋のノリのいい女店員にステッカーあげとき… 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.976663) truth=1 
text=“THIS_IS_A_MENTION ser dj es fácil, lo difícil es que tus padres no se avergüencen de ti cuando les preguntan a qué te dedicas.” 
Top Terms:
que -2.025
se -1.613
les -1.599
es -1.541
this_is_a_mention -1.537
de -1.259
qu -1.214
te -0.912
cuando -0.815
dif -0.702
lo -0.665
ser -0.552
padres -0.453
ti -0.442
tus -0.397
no -0.382
a -0.204
f -0.023
cil 0.427
dj 0.522


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION να συνεργαστεί ο τέως με τους ανεξάρτητους και να πουν το κόμμα κοκός ανελ. 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.972298) truth=1 
text=rt THIS_IS_A_MENTION desenrascanco (португальский) — искусство выкручиваться из трудного положения, не имея ни плана, ни денег 
Top Terms:
rt -1.973
this_is_a_mention -1.537


pred=0 (0.973523) truth=1 
text=THIS_IS_A_MENTION de acuerdo. vamos por eso! seamos responsables y criticos. la clase politica es una mofa... 
Top Terms:
es -1.541
this_is_a_mention -1.537
una -1.266
de -1.259
eso -1.195
vamos -0.757
clase -0.683
y -0.600
por -0.321
acuerdo -0.310
la -0.292


pred=0 (0.985851) truth=1 
text=rt THIS_IS_A_MENTION muy buena! THIS_IS_A_URL rt THIS_IS_A_MENTION THIS_IS_A_MENTION q paja que te gustó! es demasiado #filin esa campaña … 
Top Terms:
que -2.025
rt -1.973
muy -1.564
es -1.541
this_is_a_mention -1.537
esa -1.075
te -0.912
gust -0.790
q -0.693
demasiado -0.539
campa -0.283
a -0.204
buena -0.060
this_is_a_url 0.738


pred=0 (0.974638) truth=1 
text=rt THIS_IS_A_MENTION tiene huevos que susana díaz hable de cesarismo en podemos. y no quiero hablar de felipe. ¡ah, no! felipe era dios. 
Top Terms:
ah -2.260
que -2.025
rt -1.973
d -1.538
this_is_a_mention -1.537
de -1.259
quiero -1.030
az -0.908
en -0.765
era -0.762
y -0.600
felipe -0.519
no -0.382
hablar -0.274
dios -0.175
tiene -0.059
huevos 0.125
podemos 0.231


pred=1 (0.984298) truth=0 
text=obama to discuss is with europeans: us president obama is to hold a video conference with european leaders to ... THIS_IS_A_URL 
Top Terms:
a -0.204
is 0.355
discuss 0.422
hold 0.451
to 0.660
this_is_a_url 0.738
european 0.843
conference 1.139
with 1.143
leaders 1.229
president 1.501
us 1.801
video 2.933
obama 4.471


pred=0 (0.970759) truth=1 
text=rt THIS_IS_A_MENTION THIS_IS_A_MENTION 広島の宮島ですよね? 福山に友人がいるので一度。ありがとうございます。 
Top Terms:
rt -1.973
this_is_a_mention -1.537


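The listing above pairs each misclassified sentence with a per-term weight that is constant across examples (e.g. rt is always -1.973 and seo is always 4.874), which suggests a linear classifier over a bag-of-words representation: for every term present in the sentence, its coefficient is printed, sorted from most negative to most positive. Below is a minimal sketch of how such an error report could be produced. It reuses the proba and y_sent arrays that appear in the cells that follow, while vec (a fitted CountVectorizer), clf (a fitted linear model such as LogisticRegression) and x_text (the raw sentence strings) are illustrative names, not necessarily the ones used earlier in this notebook.

import numpy as np

x_vec = vec.transform(x_text)                # sparse bag-of-words counts (assumed vectorizer)
proba = clf.predict_proba(x_vec)             # class probabilities per sentence
pred = proba.argmax(axis=1)
terms = np.array(vec.get_feature_names())
coef = clf.coef_[0]                          # one weight per term in the binary case

for i in np.where(pred != y_sent)[0][:20]:   # first 20 misclassified sentences
    print "pred=%d (%g) truth=%d" % (pred[i], proba[i].max(), y_sent[i])
    print "text=%s" % x_text[i]
    print "Top Terms:"
    idx = x_vec[i].nonzero()[1]              # indices of the terms present in this sentence
    for t in sorted(idx, key=lambda k: coef[k]):
        print "%s %.3f" % (terms[t], coef[t])
    print
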
In [38]:
# Class balance of the sentence-level labels and the total number of sentences.
print "Sentence distribution", 1. * y_sent.sum() / y_sent.shape[0]
print y_sent.shape[0]


Sentence distribution 0.460468123626
309320

In [63]:
# Predicted class per sentence and the probability of the losing class;
# with two classes proba.min(axis=1) lies in [0, 0.5] and measures uncertainty.
order = proba.argmax(axis=1)
minprob = proba.min(axis=1)
# Histogram of uncertainty split by predicted class (upper edge 0.6 keeps the 0.4-0.5 bin).
plt.hist([minprob[order == 0], minprob[order == 1]], bins=np.arange(0.0, 0.6, 0.1), label=['y=0', 'y=1'])
plt.legend(loc='best')


Out[63]:
<matplotlib.legend.Legend at 0x112767410>
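
Because the losing-class probability is at most 0.5 with two classes, bars near zero in this histogram are sentences the classifier is very sure about (like the errors listed above, all predicted with probability above 0.97), while bars near 0.5 are the genuinely ambiguous ones. A quick way to quantify the confident fraction, reusing the proba array from the cell above (unc is just an illustrative local name):

unc = proba.min(axis=1)
# Fraction of sentences whose losing-class probability is below 0.1,
# i.e. predictions made with at least 90% confidence.
print "confident fraction: %.3f" % (1. * (unc < 0.1).sum() / unc.shape[0])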
