In [1]:
import sys
import os
import pandas as pd
import numpy as np
# Python 2 only: switch the default string encoding to UTF-8 while keeping
# the notebook's stdout, which reload(sys) would otherwise reset
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout
In [ ]:
# save the new results
output_filename = "realDonaldTrump_tweets_analyzed.csv"
output_path = os.path.join(os.path.expanduser("~"), "TrumpTwitterAnalysis", "Data", output_filename)
tweets_pd.to_csv(output_path, encoding='utf-8', index=False)
In [ ]:
# train a word2vec model on the tweet texts alone
from basic_analysis import filter_set
w2vmethod = 1  # 0: CBOW; 1: skip-gram
epoch = 40
nFeatures = 500
token_texts = [text.split() for text in texts if len(text) > 0]
#w2v_Trump = w2v_analyzer.w2v_analyzer(token_texts, w2vmethod, nFeatures, epoch, filter_set)
#w2v_Trump.fit(token_texts)
from gensim.models import Word2Vec
w2v_Trump = Word2Vec(token_texts, sg=w2vmethod, iter=epoch, size=nFeatures, min_count=1, window=5, workers=2)
w2v_Trump.wv.most_similar(positive=['hillary'])
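In [ ]:
# A few quick sanity checks on the freshly trained model (an illustrative
# sketch; 'clinton' and 'media' are example queries and may be missing from
# the tweet vocabulary). gensim 3.x keeps the vocabulary in w2v_Trump.wv.vocab.
print len(w2v_Trump.wv.vocab)  # vocabulary size
if 'clinton' in w2v_Trump.wv.vocab:
    print w2v_Trump.wv.similarity('hillary', 'clinton')  # cosine similarity
if 'media' in w2v_Trump.wv.vocab:
    print w2v_Trump.wv.most_similar(positive=['media'], topn=5)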
In [ ]:
nTopics = 20
nCluster = 1000
max_ngram = 1
## read LDA
from feature_extraction import *
from decomposition_analysis import *
LDApickle_filename = 'LDAdata_' + str(nTopics) + 'topics_'+ str(nCluster) + 'clusters_' + str(max_ngram) + 'ngram.pkl'
LDApickle_path = os.path.join(os.path.expanduser("~"), "TrumpTwitterAnalysis", "Pickles", LDApickle_filename)
(extractor_LDAex, X_LDAex, featureNames_LDAex, X_trans, analyzer) = pd.read_pickle(LDApickle_path)
In [ ]:
import pyLDAvis, pyLDAvis.sklearn
pyLDAvis.enable_notebook()
data_pyLDAvis = pyLDAvis.sklearn.prepare(analyzer, X_LDAex, extractor_LDAex)
#pyLDAvis.show(data_pyLDAvis)
pyLDAvis.display(data_pyLDAvis, local=True)
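In [ ]:
# The interactive view can also be written to a standalone HTML file via
# pyLDAvis.save_html; the filename below is only an example.
pyLDAvis.save_html(data_pyLDAvis, 'LDAvis_trump_tweets.html')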
In [ ]:
def get_decompositionResult(model, featureNames):
    # rank the features within each topic, then order the topics by total weight
    nTopics = model.components_.shape[0]
    results0_featurename = list()
    results0_possibility = list()
    topics_freq = np.zeros(nTopics)
    for iTopic, topic in enumerate(model.components_):
        topics_freq[iTopic] = np.sum(topic)
        topic_featurename = list()
        topic_possibility = list()
        for i in topic.argsort()[:-len(featureNames)-1:-1]:  # descending weight
            topic_featurename.append(featureNames[i])
            topic_possibility.append(topic[i])
        results0_featurename.append(topic_featurename)
        results0_possibility.append(topic_possibility)
    # reorder topics from most to least frequent
    results_featurename = list()
    results_possibility = list()
    for iTopic in topics_freq.argsort()[:-nTopics-1:-1]:
        results_featurename.append(results0_featurename[iTopic])
        results_possibility.append(results0_possibility[iTopic])
    return results_featurename, results_possibility
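In [ ]:
# Toy check of get_decompositionResult on a mock two-topic model (purely
# illustrative; not part of the analysis). Topic 1 has the larger total
# weight, so it should come out first.
class _MockModel(object):
    pass
_mock = _MockModel()
_mock.components_ = np.array([[0.1, 0.7, 0.2],
                              [2.0, 0.5, 1.5]])
_names, _weights = get_decompositionResult(_mock, ['alpha', 'beta', 'gamma'])
print _names   # expected: [['alpha', 'gamma', 'beta'], ['beta', 'gamma', 'alpha']]
print _weights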
In [ ]:
LDAresults_featurename, LDAresults_possibility = get_decompositionResult(analyzer, featureNames_LDAex)
nWords_show = 20
for iTopic in range(nTopics):
    print 'Topic %d' % iTopic
    print " ".join(LDAresults_featurename[iTopic][0:nWords_show])
    #print LDAresults_possibility[iTopic][0:nWords_show]
In [ ]:
from basic_analysis import filter_set
from w2v_analyzer import w2v_analyzer
w2vSource = 'GoogleNews'
w2vmethod = 1  # 0: CBOW; 1: skip-gram
w2vAnalyzer = w2v_analyzer(w2vSource, w2vmethod, 300, 40, filter_set)
In [99]:
# Topic keywords
myTopics = list()
# Topic 1
myTopics.append(['hillary', 'clinton', 'crooked', 'bernie', 'obama', 'years', 'campaign', 'bad'])
# Topic 2
myTopics.append(['fake', 'news', 'dishonest', 'media', 'failing', 'big', 'story', 'said'])
# Topic 3
myTopics.append(['make', 'america', 'great', 'again', 'together', 'we', 'american', 'people'])
# Topic 4
myTopics.append(['white', 'house', 'great', 'day', 'honor', 'today', 'meeting', 'senator'])
# Topic 5
myTopics.append(['obamacare', 'repeal', 'replace', 'disaster', 'crazy', 'bill', 'failed', 'reform'])
# Topic 6
myTopics.append(['join', 'live', 'rally', 'tomorrow', 'tonight', 'tickets', 'today', 'speech'])
# Topic 7
myTopics.append(['korea', 'north', 'south', 'china', 'trade', 'deficit', 'problem', 'president'])
# Topic 8
myTopics.append(['law', 'enforcement', 'officers', 'police', 'executive', 'order', 'killed', 'victims'])
# Topic 9
myTopics.append(['trump', 'vote', 'poll', 'team', 'voters', 'final', 'americans', 'debate'])
# Topic 10
myTopics.append(['jobs', 'bring', 'back', 'dollars', 'optimism', 'economic', 'market', 'companies'])
# Topic 11
myTopics.append(['enjoy', 'interviewed', 'tonight', 'looking', 'forward', 'interview', 'prime', 'minister'])
# Topic 12
myTopics.append(['heroes', 'veterans', 'honor', 'today', 'act', 'announced', 'american', 'lives'])
# Topic 13
myTopics.append(['islamic', 'terror', 'radical', 'ban', 'tough', 'allowed', 'border', 'immigration'])
# Topic 14
myTopics.append(['fbi', 'russia', 'cia', 'emails', 'director', 'illegally', 'investigation', 'server'])
# Topic 15
myTopics.append(['mexico', 'wall', 'pay', 'trade', 'deficit', 'plant', 'crime', 'deal'])
# Topic 16
myTopics.append(['syria', 'ISIS', 'syrian', 'refugees', 'immigrants', 'putin', 'rebels', 'ceasefire'])
myTopicsNames = ['HRC', 'fakeNews', 'MAGA', 'whitehouse', 'healthcare', 'join', 'korea&china', 'police', 'vote', 'jobs',
                 'interview', 'veterans', 'terror', 'fbi', 'mexico', 'refugee']
nMyTopics = len(myTopicsNames)
In [ ]:
for i, theTopic in enumerate(myTopics):
    for keyword in theTopic:
        try:
            wv = w2vAnalyzer.model.wv[keyword]
        except KeyError:
            print 'In Topic' + str(i+1) + ', keyword: ' + keyword + ' not found in w2v vocabulary!'
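In [ ]:
# The per-tweet topic columns used below (the '_trans'-suffixed columns) are
# precomputed elsewhere in the pipeline, and their exact construction is not
# shown here. As a rough sketch of one plausible scheme -- an assumption, not
# the pipeline's actual code -- a tweet could be scored against a topic by the
# cosine similarity between its mean token vector and the mean keyword vector:
def topic_score_sketch(tokens, topic_keywords, model):
    vecs = lambda ws: [model.wv[w] for w in ws if w in model.wv.vocab]
    tvecs, kvecs = vecs(tokens), vecs(topic_keywords)
    if not tvecs or not kvecs:
        return 0.0
    t, k = np.mean(tvecs, axis=0), np.mean(kvecs, axis=0)
    return np.dot(t, k) / (np.linalg.norm(t) * np.linalg.norm(k))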
In [3]:
'''
read data
'''
input_filename = "realDonaldTrump_tweets_analyzed.csv"
input_path = os.path.join(os.path.expanduser("~"), "TrumpTwitterAnalysis", "Data", input_filename)
tweets_pd = pd.read_csv(input_path)
In [ ]:
# other example rows: 242, 467, 490
irow = 296
print tweets_pd.iloc[irow]['text']
print tweets_pd.iloc[irow]['normalText']
print tweets_pd.iloc[irow][myTopicsNames]
print tweets_pd.iloc[irow][[topicName+'_trans' for topicName in myTopicsNames]]
In [4]:
simu = tweets_pd[myTopicsNames].values
simu_trans = tweets_pd[[topicName+'_trans' for topicName in myTopicsNames]].values
In [ ]:
from datetime import date, timedelta
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
#prop_cycle = plt.rcParams['axes.prop_cycle']
#colors = prop_cycle.by_key()['color']
from colors import cdict
colorNames = ['blue', 'black', 'red', 'brown', 'cyan', 'orange', 'skyblue', 'purple', 'tomato', 'yellow',
              'tan', 'magenta', 'green', 'darkblue', 'yellowgreen', 'gray']
colors = [cdict[cn] for cn in colorNames]
In [ ]:
import time
dates = []
dates_day = []
simu_dates = []
nTweets_date = []
lastDate = date(2000, 1, 1)
nDays = 0
simu_trans_valid = None
for irow, row in tweets_pd.iterrows():
    if row['w2vTokens_count'] == 0:
        continue
    theDate = date(row['created_y'], row['created_m'], 1)  # bucket by month; day fixed to 1 (row['created_d'] unused)
    if simu_trans_valid is None:
        simu_trans_valid = simu_trans[irow].reshape(1, -1)
    else:
        simu_trans_valid = np.vstack((simu_trans_valid, simu_trans[irow].reshape(1, -1)))
    if theDate != lastDate:
        nDays += 1
        dates.append(theDate)
        nTweets_date.append(1)
    else:
        nTweets_date[-1] += 1
    lastDate = theDate
print np.mean(simu_trans_valid, axis=0)
In [ ]:
# build a daily grid spanning the (reverse-chronological) monthly dates
dates_day.append(dates[0])
for i, theDate in enumerate(dates):
    for d in range(30, 1, -1):
        if theDate.month == 2 and d >= 29:  # avoid invalid February dates
            continue
        theDate_day = date(theDate.year, theDate.month, d)
        if theDate_day < dates[0] and theDate_day > dates[-1]:
            dates_day.append(theDate_day)
dates_day.append(dates[-1])
ts_month = [time.mktime(theDate.timetuple()) for theDate in dates]
ts_day = [time.mktime(theDate_day.timetuple()) for theDate_day in dates_day]
# average the topic similarities over the tweets of each month
irow = 0
for nTweets_month in nTweets_date:
    dateSimu = np.mean(simu_trans_valid[irow:irow + nTweets_month], axis=0)
    irow += nTweets_month
    simu_dates.append(dateSimu)
simu_dates = np.array(simu_dates)
assert irow == simu_trans_valid.shape[0]
assert len(nTweets_date) == simu_dates.shape[0]
from sklearn.preprocessing import scale
simu_dates = scale(simu_dates, axis=0, with_mean=True, with_std=True, copy=False)
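In [ ]:
# Cross-check of the monthly averaging above with a pandas groupby (a sketch
# assuming the same column names; not used by the rest of the notebook).
valid = tweets_pd[tweets_pd['w2vTokens_count'] > 0]
trans_cols = [topicName + '_trans' for topicName in myTopicsNames]
print valid.groupby(['created_y', 'created_m'])[trans_cols].mean().head()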
In [ ]:
from scipy.interpolate import interp1d
plt.figure(1, figsize=(24, 8))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
# group the topics into panels so the curves stay readable
#kind_set = [[0, 2, 5, 8], [1, 3, 10, 13], [4, 7, 9, 11], [6, 12, 14, 15]]
#kind_set = [[0, 1, 9, 6], [2, 10, 4, 12], [5, 3, 7, 14], [8, 13, 11, 15]]
kind_set = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]
myTopicsNames_rearrange = list()
for ks in kind_set:
    for k in ks:
        myTopicsNames_rearrange.append(myTopicsNames[k])
# chronological copy for the tick labels; reversing `dates` in place inside
# the subplot loop would flip it back and forth between iterations
dates_forward = list(reversed(dates))
ls = list()
for k, ks in enumerate(kind_set):
    ax = plt.subplot(len(kind_set), 1, k+1)
    for j in range(nMyTopics):
        if j in ks:
            # piecewise-constant interpolation from monthly points onto the daily grid
            f = interp1d(ts_month, simu_dates[:, j], kind='zero')  # alternatives: 'nearest', 'slinear', 'quadratic'
            l, = ax.plot(dates_day, f(ts_day), color=colors[j], linewidth=3)
            ls.append(l)
    plt.xlim((dates_day[-1]+timedelta(days=-2), dates_day[0]+timedelta(days=2)))
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width*0.8, box.height])
    ax.set_xticks(dates_forward)
    ax.set_xticklabels(dates_forward, rotation=90)
    plt.ylim((-2.5, 3.0))
    plt.ylabel('\'Hotness\'', color='black')
plt.legend(ls, myTopicsNames_rearrange, bbox_to_anchor=(1.13, 4.65), ncol=1, fontsize=10, labelspacing=1.5, frameon=False)
plt.gcf().autofmt_xdate()
#plt.savefig("topics_time.png")
plt.show()
In [ ]:
irow = 15
print dates[len(dates)-1-irow]
print simu_dates[len(dates)-1-irow]
In [ ]:
# given a topic, list the most similar tweets in a specific month
y = 2017
m = 6
iTopic = 8  # 1-based topic index
print 'For Topic [' + myTopicsNames[iTopic-1] + ']:'
tweets_month = tweets_pd[(tweets_pd['created_y'] == y) & (tweets_pd['created_m'] == m)]
text_month = tweets_month['normalText'].values
simu_month = tweets_month[myTopicsNames[iTopic-1]+'_trans'].values
sorted_simu_month = simu_month.argsort()[:-len(simu_month)-1:-1]  # descending similarity
sorted_text_month = [text_month[i] for i in sorted_simu_month]
for i in range(min(15, len(tweets_month))):
    print sorted_text_month[i], simu_month[sorted_simu_month[i]]
    print str(y)+'-'+str(m)+'-'+str(tweets_month.iloc[sorted_simu_month[i]]['created_d']), str(tweets_month.iloc[sorted_simu_month[i]]['created_h']) + ':' + str(tweets_month.iloc[sorted_simu_month[i]]['created_min'])
In [58]:
# extract features (topic similarities) and targets (favorite/retweet counts)
nTweets_full = 3553
X0 = None
Y = None
sources = list()
for irow, row in tweets_pd.iterrows():
    if row['w2vTokens_count'] == 0 or irow >= nTweets_full:
        continue
    rX = row[[topicName + '_trans' for topicName in myTopicsNames]].values.reshape(1, -1)
    rY = row[['favorite_count', 'retweet_count']].values.reshape(1, -1)
    sources.append(row['source'])
    if X0 is None:
        X0 = rX
        Y = rY
    else:
        X0 = np.vstack((X0, rX))
        Y = np.vstack((Y, rY))
nD = X0.shape[0]
In [32]:
y = Y[:, 0]
from transformers import LabelConverter
label = LabelConverter().fit_transform(y)
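In [ ]:
# LabelConverter is a project-local transformer whose source is not shown in
# this notebook. A converter of this kind might, for example, bin the
# continuous favorite counts into quantile classes -- an assumed sketch, not
# the project's actual implementation:
def quantile_labels(y, n_classes=3):
    edges = np.percentile(y, np.linspace(0, 100, n_classes+1)[1:-1])
    return np.digitize(y, edges)  # class ids 0 .. n_classes-1
#label_sketch = quantile_labels(y)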
In [33]:
from sklearn.preprocessing import StandardScaler
X1 = StandardScaler().fit_transform(X0)
from transformers import FilterSimu
X = FilterSimu(0.99).fit_transform(X1)
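In [ ]:
# FilterSimu is likewise project-local. Since np.nonzero(X) is used below, it
# presumably sparsifies the matrix; a plausible sketch (an assumption about
# its behavior) zeroes every entry below the q-quantile:
def filter_simu_sketch(X, q=0.99):
    Xf = X.copy()
    Xf[Xf < np.percentile(X, 100*q)] = 0.0
    return Xf
#X_sketch = filter_simu_sketch(X1, 0.99)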
In [34]:
print X0.shape
print Y.shape
In [35]:
from sklearn.svm import LinearSVC
rng = np.random.RandomState(1)
p = 'l2'
C = 1.0
classifier = LinearSVC(penalty=p, loss='squared_hinge', dual=False, tol=0.0001, C=C, multi_class='ovr',
                       fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0,
                       random_state=rng, max_iter=1000)
classifier.fit(X, label)
label_pred = classifier.predict(X)
score = classifier.score(X, label)
print classifier.coef_
print classifier.intercept_
print score
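In [ ]:
# The score above is computed on the training data itself, so it is an
# optimistic estimate. A minimal held-out sketch with 5-fold cross-validation:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LinearSVC(penalty=p, loss='squared_hinge', dual=False, C=C, random_state=rng),
                            X, label, cv=5)
print cv_scores.mean(), cv_scores.std()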
In [19]:
test_X = np.zeros((1, nMyTopics))
test_X[0, 9] = 1  # a pure 'jobs' tweet
print classifier.predict(test_X)
In [40]:
from sklearn.linear_model import LinearRegression
# single-feature regressions of the class label on each topic score
for i in range(nMyTopics):
    print myTopicsNames[i]
    LRtopic = LinearRegression().fit(X1[:, i].reshape(-1, 1), label)
    print LRtopic.coef_, LRtopic.intercept_
    print LRtopic.score(X1[:, i].reshape(-1, 1), label)
In [41]:
print X
In [46]:
nz = np.nonzero(X)
print nz[0]
print nz[1]
print nz[0].shape
print nz[1].shape
In [48]:
nz_row, nz_col = np.nonzero(X)
In [171]:
counts = np.zeros((2, nMyTopics))
num_counts = np.zeros(nMyTopics)
for i in range(nz_row.shape[0]):
    num_counts[nz_col[i]] += 1
    counts[0, nz_col[i]] += Y[nz_row[i], 0]  # index Y by the tweet row, not the topic column
    counts[1, nz_col[i]] += Y[nz_row[i], 1]
counts[0, :] /= num_counts
counts[1, :] /= num_counts
for j in range(nMyTopics):
    line = myTopicsNames[j] + ': ave_favorite = ' + str(counts[0, j])
    line += '; ave_retweet = ' + str(counts[1, j])
    line += '; retweet/favorite rate = ' + str(counts[1, j]/counts[0, j])
    print line
In [86]:
# source analysis
from collections import Counter
sources_counter = Counter(sources)
print sources_counter
# percentage of tweets coming from each source
cs = sources_counter.values()
total = float(sum(cs))
cs = [100*c/total for c in cs]
for s, c in zip(sources_counter.keys(), cs):
    print s + ": " + str(c)
set_sources = set(sources)
list_sources = list(set_sources)
dict_sources = {s: i for i, s in enumerate(list_sources)}
print list_sources
print dict_sources
In [87]:
counts_source = np.zeros((len(set_sources), nMyTopics))
for i in range(nz_row.shape[0]):
    source = sources[nz_row[i]]
    counts_source[dict_sources[source], nz_col[i]] += 1
counts_source /= np.sum(counts_source, axis=0)  # per-topic share of each source
rows_show = [0, 2, 3, 4, 6, 7]
list_sources_show = [source for i, source in enumerate(list_sources) if i in rows_show]
counts_source_show = counts_source[rows_show]
for j in range(nMyTopics):
    print myTopicsNames[j]
    for i in range(counts_source_show.shape[0]):
        print list_sources_show[i] + ': ' + str(100*counts_source_show[i, j])
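In [ ]:
# Cross-check with pandas: share of each source per topic, assigning every
# tweet to its dominant (argmax) topic. A simplified sketch, not identical to
# the nonzero-entry counting above (all-zero rows default to topic 0).
dom_topic = [myTopicsNames[j] for j in np.argmax(X, axis=1)]
print pd.crosstab(pd.Series(sources, name='source'), pd.Series(dom_topic, name='topic'), normalize='columns')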
In [81]:
print counts_source
print dict_sources
print list_sources
In [82]:
from operator import itemgetter
print sorted(dict_sources.items(), key=itemgetter(1))  # (source, index) pairs ordered by index
In [170]:
# time-of-day analysis
delta_hour = 1
hours_grid = np.linspace(0, 24-delta_hour, 24/delta_hour)
hours_grid = [int(h) for h in hours_grid]
nhgrid = len(hours_grid)
count_times = np.zeros(nhgrid)
topic_times = np.zeros((nhgrid, nMyTopics))
for irow, row in tweets_pd.iterrows():
    if row['w2vTokens_count'] == 0 or irow >= nTweets_full:
        continue
    h = row['created_h']
    # find the hour bin; the last bin catches everything past the grid
    ih = nhgrid-1
    for i in range(nhgrid-1):
        if h >= hours_grid[i] and h < hours_grid[i+1]:
            ih = i
            break
    count_times[ih] += 1
    for j, topicName in enumerate(myTopicsNames):
        topic_times[ih, j] += row[topicName + '_trans']
print count_times
count_times /= sum(count_times)
# relative strength: per-hour topic mass divided by the share of tweets in that hour
for j in range(nMyTopics):
    topic_times[:, j] /= count_times
# normalize each topic's daily profile to sum to 1
from sklearn.preprocessing import normalize
topic_times = normalize(topic_times, norm='l1', axis=0)
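In [ ]:
# Cross-check of the hourly binning with pandas (a sketch assuming the same
# column names; with delta_hour = 1 the bin is just the hour itself):
valid = tweets_pd.iloc[:nTweets_full]
valid = valid[valid['w2vTokens_count'] > 0]
print valid.groupby('created_h').size()  # tweets per hour, cf. count_times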
In [166]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import time
# place each hourly value at the midpoint of its bin
x_times = list()
for i in range(nhgrid):
    x_times.append(time(hours_grid[i], 30, 0))
delta_ticks = 2
hours_ticks = np.linspace(0, 24-delta_ticks, 24/delta_ticks)
xtick_times = [time(int(h), 0, 0) for h in hours_ticks]
plt.figure()
ax1 = plt.subplot(1, 1, 1)
plt.plot(x_times, count_times, '-k', linewidth=3)
ax1.set_xticks(xtick_times)
plt.xlim((time(0, 0, 0), time(23, 59, 59)))
plt.xlabel('Time')
plt.ylabel('Percentage of tweets')
plt.gcf().autofmt_xdate()
plt.show()
In [169]:
from colors import cdict
colorNames = ['blue', 'black', 'red', 'brown', 'cyan', 'orange', 'skyblue', 'purple', 'tomato', 'yellow',
              'tan', 'magenta', 'green', 'darkblue', 'yellowgreen', 'gray']
colors = [cdict[cn] for cn in colorNames]
kind_set = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]
myTopicsNames_rearrange = list()
for ks in kind_set:
    for k in ks:
        myTopicsNames_rearrange.append(myTopicsNames[k])
from sklearn.preprocessing import scale
count_times_std = scale(count_times.reshape(-1, 1), axis=0)
topic_times_std = scale(topic_times, axis=0)
plt.figure()
for k, ks in enumerate(kind_set):
    ax = plt.subplot(2, 2, k+1)
    ax.plot(x_times, count_times_std, '--')  # dashed: overall tweet volume, standardized
    for j in range(nMyTopics):
        if j in ks:
            ax.plot(x_times, topic_times_std[:, j], color=colors[j], linewidth=3, label=myTopicsNames[j])
    ax.set_xticks(xtick_times)
    ax.legend()
    plt.xlim((time(0, 0, 0), time(23, 59, 59)))
    plt.xlabel('Time')
    plt.ylabel('Relative Strength')
plt.gcf().autofmt_xdate()
plt.show()
In [161]:
print np.mean(count_times_std), np.std(count_times_std)
In [162]:
print count_times_std