In [2]:
import json

# data_path = 'C:/Users/mramire8/Documents/Datasets/twitter'
data_path = '../../data/twitter'

def get_tweets_file(path):
    f = open(path)
    users = []
    for line in f:
        # Each line holds several users' tweet lists concatenated as [[...]][[...]];
        # splitting on "]][[" drops brackets at the cut points, so restore them.
        data = line.split("]][[")
        last = len(data)
        for i, tweets in enumerate(data):
            if i == 0:
                t = json.loads(tweets[1:] + "]")
            elif i == (last - 1):
                t = json.loads("[" + tweets[:-1])
            else:
                t = json.loads("[" + tweets + "]")
            users.append(t)
    return users

good = get_tweets_file(data_path + "/good.json")
print "Real users %s" % (len(good))
bots = get_tweets_file(data_path + "/bots.json")
print "Bot users %s" % (len(bots))
In [2]:
print "total: ", 883+898
print "\n".join(sorted(good[0][0].keys()))
In [3]:
print "\n".join(sorted(good[0][0]['user'].keys()))
To avoid biasing the classifier with trending topics, the data should belong to the same date range; we want to make sure the users in both classes come from the same period of time.
Otherwise, the classifier can learn the topics of the period instead of distinguishing bots from real users. We check the date distribution of both classes.
In [3]:
import datetime
from collections import Counter

def get_date(date_str):
    # Twitter's created_at format, e.g. "Wed Aug 27 13:08:45 +0000 2014"
    return datetime.datetime.strptime(date_str.strip('"'), "%a %b %d %H:%M:%S +0000 %Y")

def count_dates(users):
    # Count users by the timestamp of the first tweet in their stored timeline,
    # and track the earliest such timestamp.
    dates = Counter()
    min_dt = get_date(users[0][0]['created_at'])
    for user in users:
        d = get_date(user[0]['created_at'])
        min_dt = min(min_dt, d)
        dates.update([d])
    return dates, min_dt

good_counts, min_good = count_dates(good)
print "Most common: %s" % good_counts.most_common(3)
print "Earliest: %s" % min_good
bots_counts, min_bots = count_dates(bots)
print "Most common: %s" % bots_counts.most_common(3)
print "Earliest: %s" % min_bots
In [4]:
## Number of users (distinct first-tweet timestamps, to be precise)
## whose first recorded tweet predates 2014
print "Old good users %s" % len([d for d in good_counts.keys() if d.year < 2014])
print "Old bot users %s" % len([d for d in bots_counts.keys() if d.year < 2014])
In [36]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
mpl.style.use('fivethirtyeight')
years = mdates.YearLocator() # every year
months = mdates.MonthLocator() # every month
yearsFmt = mdates.DateFormatter('%Y')
monthsFmt = mdates.DateFormatter('%Y-%m')
fig = plt.figure(figsize=(7,7))
ax = plt.axes()
gds =[(d,c) for d,c in good_counts.iteritems() if d.year > 2013]
bts =[(d,c) for d,c in bots_counts.iteritems() if d.year > 2013]
kg = [d.toordinal() for d,_ in gds]
kb = [d.toordinal() for d,_ in bts]
wg = [c for _,c in gds]
wb = [c for _,c in bts]
plt.hist([kg,kb], weights=[wg,wb], bins=20, stacked=False, alpha=.7, label=['real', 'bots'])
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(monthsFmt)
# ax.xaxis.set_minor_locator(months)
plt.legend(loc='best', frameon=False, numpoints=1)
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
fig.autofmt_xdate()
plt.savefig("date_dist.png")
For each user we have the tweets in their timeline. Let $s = \{t_i\}_{i=1}^{200}$ be the tweets of a user, where each $t_i$ is a tweet. Let $D = \{(s_j, y_j)\}_{j=1}^{n}$ be the dataset, where $y_j \in \{\text{human}, \text{bot}\}$.
For every user we process the text of the tweets as follows: we optionally lowercase it and collapse URLs and @-mentions into placeholder tokens.
We create a dataset dictionary containing: the per-user timelines (data), the labels (target), the user ids and names (user_id, user_name), the concatenated timeline text (user_text), and its bag-of-words representation (bow).
In [6]:
## convert the tweets into a collection of text documents
# from sklearn.datasets.base import Bunch
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

def preprocess(string, lowercase, collapse_urls, collapse_mentions):
    if not string:
        return ""
    if lowercase:
        string = string.lower()
    if collapse_urls:
        string = re.sub('http\S+', 'THIS_IS_A_URL', string)
    if collapse_mentions:
        string = re.sub('@\S+', 'THIS_IS_A_MENTION', string)
    return string

def timeline_to_doc(user, lowercase, collapse_urls, collapse_mentions):
    tweets = []
    for tw in user:
        tweets.append(preprocess(tw['text'], lowercase, collapse_urls, collapse_mentions))
    return tweets

def user_to_doc(users, lowercase, collapse_urls, collapse_mentions):
    timeline = []
    user_names = []
    user_id = []
    for user in users:
        timeline.append(timeline_to_doc(user, lowercase, collapse_urls, collapse_mentions))
        user_names.append(user[0]['user']['name'])
        user_id.append(user[0]['user']['screen_name'])
    return user_id, user_names, timeline

def bunch_users(class1, class2, vct, lowercase, collapse_urls, collapse_mentions, labels=None):
    # Targets are always encoded 0 (class1) / 1 (class2); `labels` only names the classes.
    if labels is None:
        labels = [0, 1]
    user_id, user_names, timeline = user_to_doc(class1, lowercase, collapse_urls, collapse_mentions)
    user_id2, user_names2, timeline2 = user_to_doc(class2, lowercase, collapse_urls, collapse_mentions)
    target = [0] * len(user_id)
    user_id.extend(user_id2)
    user_names.extend(user_names2)
    timeline.extend(timeline2)
    target.extend([1] * len(user_id2))
    user_text = [". ".join(t) for t in timeline]
    data = {'data': timeline, 'target': np.array(target), 'user_id': user_id,
            'user_name': user_names, 'user_text': user_text}
    data['bow'] = vct.fit_transform(data['user_text'])
    # Shuffle every field with the same permutation so they stay aligned.
    random_state = np.random.RandomState(5612)
    indices = np.arange(data['bow'].shape[0])
    random_state.shuffle(indices)
    data['target'] = data['target'][indices]
    data_lst = np.array(data['data'], dtype=object)
    data['data'] = data_lst[indices].tolist()
    data['bow'] = data['bow'][indices]
    data['user_id'] = np.array(data['user_id'])[indices]
    data['user_name'] = np.array(data['user_name'])[indices]
    data['user_text'] = np.array(data['user_text'])[indices]
    data['target_names'] = labels
    return data
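A quick sanity check of the preprocessing on a made-up tweet; note that the \S+ patterns also absorb trailing punctuation:
In [ ]:
print preprocess("Check this out http://t.co/abc123 via @someuser!", True, True, True)
# -> check this out THIS_IS_A_URL via THIS_IS_A_MENTION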
In [17]:
import numpy as np
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                      token_pattern='\\b\\w+\\b')
gds = [g for g in good if get_date(g[0]['created_at']).year > 2013]
bts = [b for b in bots if get_date(b[0]['created_at']).year > 2013]
data = bunch_users(gds,bts, vct, True, True, True, labels=['good', 'bots'])
print "Total data:", len(data['target'])
In [18]:
import sys
import os
sys.path.append(os.path.abspath("/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages"))

from sklearn.learning_curve import learning_curve
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold, ShuffleSplit
import itertools

def get_tableau():
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
    # Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
    for i in range(len(tableau20)):
        r, g, b = tableau20[i]
        tableau20[i] = (r / 255., g / 255., b / 255.)
    return tableau20

def learning_curve_tweet(data, clf, sizes=None, curve_label=None):
    col = get_tableau()
    colors_n = itertools.cycle(col)
    # Shuffle every field with the same permutation so they stay aligned.
    random_state = np.random.RandomState(56124)
    indices = np.arange(data['bow'].shape[0])
    random_state.shuffle(indices)
    data['target'] = np.array(data['target'])[indices]
    data_lst = np.array(data['data'], dtype=object)
    data['data'] = data_lst[indices].tolist()
    data['bow'] = data['bow'][indices]
    try:
        data['user_id'] = np.array(data['user_id'])[indices]
        data['user_name'] = np.array(data['user_name'])[indices]
        data['user_text'] = np.array(data['user_text'])[indices]
    except Exception:
        pass
    scoring_fn = 'accuracy'
    if sizes is None:
        sizes = range(20, 4 * len(data['target']) / 5, 100)
    train_sizes, train_scores, test_scores = learning_curve(
        clf, data['bow'], data['target'], train_sizes=sizes, cv=5, scoring=scoring_fn, n_jobs=2)
    current_color = colors_n.next()
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    # Standard error of the mean over the 5 folds.
    test_scores_std = 1.0 * np.std(test_scores, axis=1) / np.sqrt(5.0)
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color=current_color)
    plt.plot(train_sizes, test_scores_mean, 'o-', mfc='white', linewidth=2, mew=2,
             markersize=10, mec=current_color, color=current_color,
             label="{}".format(curve_label))
    print ("-" * 40)
    print ("\nSIZE\tMEAN\tSTDERR")
    print ("\n".join(["{0}\t{1:.3f}\t{2:.4f}".format(c, m, s)
                      for c, m, s in zip(train_sizes, test_scores_mean, test_scores_std)]))
    plt.legend(loc="best")
    plt.savefig('lradapt-sent-sent.png', bbox_inches="tight", dpi=200, transparent=True)
    plt.show()
In [10]:
classifier = 'lr'
if classifier == "mnb":
    clf = MultinomialNB(alpha=1)
else:
    clf = linear_model.LogisticRegression(penalty='l1', C=10)
learning_curve_tweet(data, clf)
The following results show how the preprocessing of the text affects classification.
Text Processing We observe no significant differences among the options; however, the best-performing configurations collapse mentions.
This seems to suggest that collapsing mentions helps more than the other preprocessing options.
In [11]:
## try all combinations of preprocessing options
def try_all(clf, vct, good, bots):
    lowercase_opts = [True, False]
    url_opts = [True, False]
    mention_opts = [True, False]
    argnames = ['lower', 'url', 'mention']
    option_iter = itertools.product(lowercase_opts, url_opts, mention_opts)
    for options in option_iter:
        print '\t'.join('%s=%s' % (name, opt) for name, opt in zip(argnames, options))
        data = bunch_users(good, bots, vct, *options)
        cv_scores = cross_val_score(clf, data['bow'], data['target'], cv=5, n_jobs=1)
        print("5-f CV Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
In [12]:
clf = linear_model.LogisticRegression(penalty='l1', C=10)
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [13]:
## Try with bigrams and an L2 penalty
clf = linear_model.LogisticRegression(penalty='l2', C=10)
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [14]:
## Try by removing stopwords
clf = linear_model.LogisticRegression(penalty='l2', C=10)
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b', stop_words='english')
try_all(clf, vct, gds, bts)
In [15]:
# MultinomialNB's alpha must be a scalar; the class proportions belong in class_prior
clf = MultinomialNB(alpha=1.0, class_prior=[.54, .46])
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [17]:
clf = linear_model.LogisticRegression(penalty='l2', C=1)
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [18]:
def print_features(coef, names):
    """Print a sorted list of non-zero features/weights."""
    # e.g. coef = clf.coef_[0]; names = vct.get_feature_names()
    print "*" * 50
    print("Number of Features: %s" % len(names))
    print "\n".join('%s\t%.2f' % (names[j], coef[j]) for j in np.argsort(coef)[::-1] if coef[j] != 0)
    print "*" * 50
In [19]:
clf = linear_model.LogisticRegression(penalty='l1', C=10)
data = bunch_users(gds, bts, vct, True, True, True)
clf.fit(data['bow'], data['target'])
print_features(clf.coef_[0], vct.get_feature_names())
For the classifier of choice we look for the parameter setting that maximizes accuracy. For logistic regression we tune the regularization parameter C.
We found that C=10 seems to work well with the data.
Note: the data only includes users with tweets in the current year.
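For reference, in scikit-learn's parameterization C is the inverse of the regularization strength; the L1-penalized logistic regression solves
$$\min_{w,c}\ \|w\|_1 + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (w^\top x_i + c)}\right),$$
so larger C means weaker regularization and, for L1, fewer coefficients pushed to zero.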
In [20]:
# Grid search for the best estimator
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV

def grid_search_clf(data, clf, tuned_parameters):
    import copy
    measure = 'accuracy'
    X_train, X_test, y_train, y_test = train_test_split(data['bow'], data['target'],
                                                        test_size=0.25, random_state=0)
    print("# Tuning hyper-parameters for %s" % measure)
    print(len(y_train))
    clf_new = copy.copy(clf)
    clfGS = GridSearchCV(clf_new, tuned_parameters, cv=5, scoring=measure, n_jobs=10, refit=True)
    clfGS.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clfGS.best_estimator_)
    print("")
    print("Grid scores on development set:")
    print("")
    for params, mean_score, scores in clfGS.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print("")
    print("Detailed classification report:")
    print("")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("")
    y_true, y_pred = y_test, clfGS.predict(X_test)
    print(classification_report(y_true, y_pred))
In [22]:
tuned_parameters = [{'C': [pow(10,x) for x in range(-3,4)]}]
grid_search_clf(data, linear_model.LogisticRegression(penalty='l1', C=10), tuned_parameters)
In [23]:
tuned_parameters = [{'C': [pow(10,x) for x in range(-3,4)]}]
grid_search_clf(data, linear_model.LogisticRegression(penalty='l2'), tuned_parameters)
We test the effect of the data representation on the performance of the classifier.
We observe that the performance of the classifier does not increase significantly.
Data Representation
We tried two vectorizers, CountVectorizer and TfidfVectorizer, each combined with unigrams and bigrams.
TF-IDF with unigrams plus bigrams works best.
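As a toy illustration of the difference between the two representations (made-up three-document corpus, not part of the dataset): raw counts weigh every occurrence the same, while TF-IDF downweights terms that occur in every document.
In [ ]:
corpus = ["the bot posts spam", "the user posts photos", "the bot posts links"]
print CountVectorizer().fit_transform(corpus).toarray()
print TfidfVectorizer().fit_transform(corpus).toarray().round(2)
# "the" and "posts" appear in every document, so their TF-IDF weights are
# relatively lower than those of the distinguishing terms.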
In [24]:
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [25]:
vct = TfidfVectorizer(encoding='latin1', min_df=1, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [26]:
vct = CountVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                      token_pattern='\\b\\w+\\b')
try_all(clf, vct, gds, bts)
In [19]:
## sentence detector for tweets
def sentence_detector(timeline):
    # Placeholder: for now, each tweet is treated as a single sentence.
    tl = []
    for tw in timeline:
        tl.append(tw)
    return tl

def convert2sentence(data):
    # Flatten the user timelines into individual tweets, repeating each user's label.
    all_sent = []
    all_target = []
    for user_timeline, label in zip(data['data'], data['target']):
        sentences = user_timeline
        lbls = [label] * len(sentences)
        all_sent.extend(sentences)
        all_target.extend(lbls)
    return all_sent, all_target
In [20]:
vct = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,2),
                      token_pattern='\\b\\w+\\b')
x_sent, y_sent = convert2sentence(data)
random_state = np.random.RandomState(5612)
indices = np.arange(len(y_sent))
random_state.shuffle(indices)
y_sent = np.array(y_sent)[indices]
data_lst = np.array(x_sent, dtype=object)
x_sent = data_lst[indices].tolist()
In [29]:
clf_sent = linear_model.LogisticRegression(penalty='l1', C=10)
print "Total sentences:", len(y_sent)
learning_curve_tweet({'data': x_sent, 'target': y_sent, 'bow': vct.fit_transform(x_sent)},
                     clf_sent, sizes=range(1000, 20000, 1000), curve_label="LRL1-C=10")
In [37]:
#other classifiers
lr2 = linear_model.LogisticRegression(penalty='l2', C=1)
learning_curve_tweet({'data': x_sent, 'target': y_sent, 'bow': vct.fit_transform(x_sent)},
                     lr2, sizes=range(1000, 20000, 1000), curve_label="LRL2-C=1")
In [30]:
#other classifiers
lr2 = linear_model.LogisticRegression(penalty='l2', C=10)
learning_curve_tweet({'data': x_sent, 'target': y_sent, 'bow': vct.fit_transform(x_sent)},
                     lr2, sizes=range(1000, 20000, 1000), curve_label="LRL2")
In [31]:
mnb = MultinomialNB(alpha=1)
learning_curve_tweet({'data': x_sent, 'target': y_sent, 'bow': vct.fit_transform(x_sent)},
                     mnb, sizes=range(1000, 20000, 1000), curve_label="MNB")
In [32]:
data.keys()
Out[32]:
In [69]:
term_doc = vct.inverse_transform(data['bow'])
print term_doc[:1]
In [34]:
print type(term_doc)
print len(term_doc)
print len(data['user_id'])
text = "i love going to the beach beach beach beach"
bottext = vct.transform([text])
print bottext
print bottext.nonzero()
In [62]:
def uniqueness_features(inverse_vector, vct):
    # Fraction of the vocabulary that each document actually uses.
    unique = len(vct.get_feature_names())
    x = []
    for d in inverse_vector:
        x.append([1. * len(d) / unique])
    return np.array(x)

UX = uniqueness_features(term_doc, vct)
print UX.shape
print data['target'].shape
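To make the feature concrete, a small sketch reusing the toy "beach" sentence from In [34] (the exact numbers depend on the fitted vocabulary, so they are illustrative only):
In [ ]:
n_vocab = len(vct.get_feature_names())
n_used = len(bottext.nonzero()[1])  # distinct vocabulary terms the toy sentence touches
print "uses %d of %d terms -> uniqueness = %.4f" % (n_used, n_vocab, 1. * n_used / n_vocab)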
In [64]:
def cross_val(X, y, clf, cv=5):
    cv_scores = cross_val_score(clf, X, y, cv=cv, n_jobs=1)
    print("%d-f CV Accuracy: %0.2f (+/- %0.2f)" % (cv, cv_scores.mean(), cv_scores.std() * 2))

uclf = linear_model.LogisticRegression(penalty='l2', C=1)
cross_val(UX, data['target'], uclf)
print UX[:10], data['target'][:10]
In [68]:
plt.scatter(UX, data['target'], c=data['target'])
plt.show()
In [21]:
y0 = []
y1 = []
for d, l in zip(data['data'], data['target']):
    if l == 0:
        y0.append(len(d))
    else:
        y1.append(len(d))
print "Real users:", len(y0)
print "Bot users:", len(y1)
In [22]:
tw_y0 = Counter()
for c in y0:
    tw_y0[c] += 1
tw_y1 = Counter()
for c in y1:
    tw_y1[c] += 1
In [23]:
print "Most common:", tw_y0.most_common(3)
print "Least common", tw_y0.most_common()[:-3-1:-1]
print "Least tweets:", min(tw_y0.keys())
print np.mean(y0)
In [24]:
print "Most common:", tw_y1.most_common(3)
print "Least common", tw_y1.most_common()[:-3-1:-1]
print "Least tweets:", min(tw_y1.keys())
print np.mean(y1)
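The slice [:-3-1:-1] walks the most_common() list backwards and stops after three items, i.e. it returns the three least common values, rarest first. A tiny self-contained check:
In [ ]:
c = Counter("aaaabbbccd")
print c.most_common()            # [('a', 4), ('b', 3), ('c', 2), ('d', 1)]
print c.most_common()[:-3-1:-1]  # [('d', 1), ('c', 2), ('b', 3)]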
In [25]:
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score
vct_sent = TfidfVectorizer(encoding='latin1', min_df=5, max_df=1.0, binary=False, ngram_range=(1,1),
                           token_pattern='\\b\\w+\\b')
x_train, x_test, y_train, y_test = train_test_split(x_sent, y_sent, test_size=0.5, random_state=42)
x_train_bow = vct_sent.fit_transform(x_train)
x_test_bow = vct_sent.transform(x_test)
lrl2 = linear_model.LogisticRegression(penalty='l2', C=1)
lrl2.fit(x_train_bow, y_train)
Out[25]:
In [26]:
proba = lrl2.predict_proba(x_test_bow)
pred = lrl2.predict(x_test_bow)
print "Accuracy %f" % accuracy_score(y_test, pred)
In [39]:
def print_top_terms(model, terms, n=20):
    # Positive coefficients push toward class 1 (bots); negative toward class 0 (real).
    print '\nTop Coefficients'
    coef = model.coef_[0]
    srted = np.argsort(coef)
    topi = srted[::-1][:n]
    boti = srted[:n]
    print 'Bot Terms:\n' + '\n'.join('%s (%g)' % (t, c) for t, c in zip(terms[topi], coef[topi]))
    print '\nReal Terms:\n' + '\n'.join('%s (%g)' % (t, c) for t, c in zip(terms[boti], coef[boti]))
    print '\nintercept=%g' % model.intercept_

def print_terms_and_coef(row, terms, coef):
    indices = sorted(row.indices, key=lambda x: coef[x])
    print 'Top Terms:'
    for i in indices:
        if coef[i] != 0:
            print terms[i], "%.3f" % coef[i]
    print

def error_analysis(clf, predicted, predicted_proba, X, tweets, terms):
    # Note: reads y_test from the enclosing notebook scope.
    print_top_terms(clf, np.array(terms))
    print '\nERRORS:'
    for i in range(predicted_proba.shape[0]):
        probability = predicted_proba[i][predicted[i]]
        # Report only the confident mistakes.
        if predicted[i] != y_test[i] and probability > .97:
            print '\npred=%d (%g) truth=%d \ntext=%s ' % (predicted[i], probability,
                                                          y_test[i], tweets[i])
            print_terms_and_coef(X.getrow(i), terms, clf.coef_[0])

error_analysis(lrl2, pred, proba, x_test_bow, x_test, vct_sent.get_feature_names())
In [38]:
print "Sentence distribution", 1.* y_sent.sum()/y_sent.shape[0]
print y_sent.shape[0]
In [63]:
pred_class = proba.argmax(axis=1)  # predicted class per sentence
minprob = proba.min(axis=1)        # probability of the losing class, i.e. 1 - confidence
plt.hist([minprob[pred_class == 0], minprob[pred_class == 1]],
         bins=np.arange(0.0, .5, .1), label=['y=0', 'y=1'])
plt.legend(loc='best')
Out[63]:
In [ ]: