This Jupyter Notebook guides researchers through mining and visualizing content from Twitter. It runs a keyword search across a sample of roughly the past 7 days of Twitter posts (the window covered by Twitter's standard Search API), scores each tweet's sentiment, and downloads the results to a structured CSV table.
You will need a Twitter account to register for Twitter consumer and access keys. Register here: https://apps.twitter.com/
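Once the app is registered, connecting from Python takes only a few lines. A minimal sketch with placeholder key strings (the query cells further down wire in the same calls; the verify_credentials() check here is just one way to confirm the keys work):

import tweepy

# Placeholder strings -- substitute the keys from your registered app
auth = tweepy.OAuthHandler('YOUR_CONSUMER_KEY', 'YOUR_CONSUMER_SECRET')
auth.set_access_token('YOUR_ACCESS_TOKEN', 'YOUR_ACCESS_SECRET')
api = tweepy.API(auth)

# Any authenticated call confirms the keys work; this one echoes your handle
print api.verify_credentials().screen_name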
In [ ]:
cursor = ' >> '

### fix for Python 2.7: patch vaderSentiment to read its data files as utf-8
import codecs
try:
    import vaderSentiment
except ImportError:
    print cursor, 'vaderSentiment not installed'
    print cursor, cursor, '...installing'
    !pip install vaderSentiment
    import vaderSentiment
    print cursor, cursor, '...success\n'
# Prepend 'from io import open' to vaderSentiment.py so the module opens
# its lexicon as unicode under Python 2.7
vader_path = vaderSentiment.__file__.replace('__init__.pyc', 'vaderSentiment.py')
with codecs.open(vader_path, 'r', encoding='utf8') as original:
    data = original.read()
if 'from io import open' not in data:
    with codecs.open(vader_path, 'w', encoding='utf8') as modified:
        modified.write('# This Python file uses the following encoding: utf-8\nfrom io import open\n' + data)
###
print cursor, 'Importing Libraries'
try:
    import tweepy
except ImportError:
    print cursor, 'tweepy not installed'
    print cursor, cursor, '...installing'
    !pip install tweepy
    import tweepy
    print cursor, cursor, '...success\n'
import re, string, warnings, os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import ipywidgets as widgets
from IPython.display import clear_output
from Tkinter import *
from tkFileDialog import askopenfilename
import matplotlib.pyplot as plt
from os import path
try:
    from wordcloud import WordCloud
except ImportError:
    print cursor, 'wordcloud not installed'
    print cursor, cursor, '...installing'
    !pip install wordcloud
    from wordcloud import WordCloud
    print cursor, cursor, '...success\n'
import pandas as pd
try:
    warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
    from gensim import corpora, models
except ImportError:
    print cursor, 'gensim not installed'
    print cursor, cursor, '...installing'
    !pip install gensim
    warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
    from gensim import corpora, models
    print cursor, cursor, '...success\n'
print cursor, 'Success!'
toggle = widgets.ToggleButtons(
    options=['Query API', 'Import Table'],
    description='Tweets:',
    disabled=False
)
toggle
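Run the cell and click one of the two buttons before continuing: the next cell reads the choice from toggle.value, which ipywidgets updates in place as you click, so nothing else needs to be passed along.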
In [ ]:
# Can leave blank if importing a table
ckey = ''
csecret = ''
atoken = ''
asecret = ''

# function to browse for a file
def browse_file():
    root = Tk()
    filename = askopenfilename()
    root.update()
    # root.destroy() # Macs are unable to recreate after destroy - use withdraw()
    root.withdraw()
    return filename

if toggle.value == 'Query API':
    s_term = widgets.Text()
    print '\n', cursor, 'Testing API Connection'
    OAUTH_KEYS = {'consumer_key': ckey, 'consumer_secret': csecret, 'access_token_key': atoken, 'access_token_secret': asecret}
    auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
    # The access token must be set on the handler as well as the consumer keys
    auth.set_access_token(OAUTH_KEYS['access_token_key'], OAUTH_KEYS['access_token_secret'])
    api = tweepy.API(auth)
    try:
        # Any authenticated call verifies the credentials
        user = api.get_user('baylor_ds')
        print cursor, 'Success!'
        print '\n', cursor, 'Enter search below:'
        print cursor, '(double-quotes around phrases)'
        print cursor, '(# in front of a hashtag search)'
    except:
        print cursor, 'Something has gone amiss. Check the consumer and access keys'
else:
    try:
        s_term.close()
    except:
        pass
    print cursor, 'Browse for table'
    imported_table = browse_file()
    if imported_table != '':
        s_term = cursor + ' Success!'
    else:
        s_term = cursor + ' Run code block to try again'
s_term
In [ ]:
def strip_non_ascii(cleanme):
    # Replace punctuation with spaces, drop newlines and commas, keep printable ASCII
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = regex.sub(' ', cleanme)
    stripped = stripped.replace('\n', '')
    stripped = stripped.replace(',', '')
    stripped = (c for c in stripped if 0 < ord(c) < 127)
    return ''.join(stripped)

def sent(line):
    # Return VADER's compound polarity score for a line of text
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(line)
    return float(vs['compound'])
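# Note: VADER's 'compound' score is normalized to [-1, 1]. Its authors suggest
# +/-0.05 as a typical neutral band; this notebook uses the stricter +/-0.25
# cutoff below, so borderline tweets land in the 'neutral' bucket.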
# check_words() is provided for optional re-use; it is not called elsewhere in this
# notebook. It requires the NLTK stopwords corpus (import nltk; nltk.download('stopwords'))
# and the global 'search' term set in the Query API branch below.
def check_words(text):
    from nltk.corpus import stopwords
    return ' '.join([word for word in text.lower().split()
                     if (word not in stopwords.words('english')
                         and 'http' not in word
                         and len(word) >= 4
                         and not word.isdigit()
                         and word not in search
                         and search not in word
                         and 'amp' not in word)])
i = 1
positive = 0
negative = 0
neutral = 0
all_text = ''
sent_dict = {}
lang_list = []
positive_list = ''
negative_list = ''
neutral_list = ''
documents = []

if toggle.value == 'Query API':
    search = s_term.value
    limit = 0  # Leave 0 for all possible results
    only_english = 'true'
    f = open('tweets.csv', 'w')
    f.write('author,screen_name,date-time,hashtags,expanded-url,tweet,Retweeted,Likes,location,timezone,coordinates,sentiment,language\n')
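    # Note: rows are written with bare commas below; strip_non_ascii() removes commas
    # from the text fields, which is what keeps this hand-rolled CSV aligned. The
    # standard library's csv.writer would handle quoting automatically if you extend this.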
    print cursor, 'Processing Twitter API Query'
    try:
        for tweet in tweepy.Cursor(api.search, q=(search), include_entities=True, tweet_mode='extended').items(limit):
            tweet_list = []
            pass_through = 0
            if only_english == 'true' and tweet.lang == 'en':
                pass_through = 1
            elif only_english == 'false':
                pass_through = 1
            if pass_through == 1:
                tweet_list.append(strip_non_ascii(tweet.author.name.encode('utf8')))
                tweet_list.append(strip_non_ascii(tweet.author.screen_name.encode('utf8')))
                tweet_list.append(str(tweet.created_at))
                # Pull the first expanded URL out of the status's string form
                try:
                    url = str(tweet).split('\'expanded_url\': u\'')[1]
                    url = url.split(', u')[0]
                except:
                    url = ''
                hashtags = ''
                for item in tweet.entities.get('hashtags'):
                    hashtags = hashtags + item.get('text') + ';'
                tweet_list.append(strip_non_ascii(hashtags[:-1]))
                if 'twitter.com' not in url and ',' not in url:
                    tweet_list.append(url.replace('\'', ''))
                else:
                    tweet_list.append('')
                # Retweets truncate full_text; use the original status's text instead
                if 'retweeted_status' in str(tweet):
                    tweet_list.append(strip_non_ascii(tweet.retweeted_status.full_text.encode('utf8')))
                    rt = 'TRUE'
                else:
                    tweet_list.append(strip_non_ascii(tweet.full_text.encode('utf8')))
                    rt = 'FALSE'
                tweet_list.append(rt)
                tweet_list.append(str(tweet.favorite_count))
                tweet_list.append(strip_non_ascii(str(tweet.user.location.encode('utf8'))))
                tweet_list.append(strip_non_ascii(str(tweet.user.time_zone)))
                if 'type' in str(tweet.geo):
                    tweet_list.append(str(tweet.geo).replace(',', ''))
                else:
                    tweet_list.append('Not Provided')
                # tweet_list[5] is the tweet text appended above
                sentiment = sent(tweet_list[5])
                if sentiment >= .25:
                    positive += 1
                    positive_list += tweet_list[5]
                elif sentiment <= -.25:
                    negative += 1
                    negative_list += tweet_list[5]
                else:
                    neutral += 1
                    neutral_list += tweet_list[5]
                tweet_list.append(str(sentiment))
                all_text += tweet_list[5]
                tweet_list.append(tweet.lang)
                for item in tweet_list:
                    f.write(item + ',')
                f.write('\n')
                if i % 25 == 0 or i == 1:
                    clear_output(wait=True)
                    print cursor, 'Processing Twitter API Query'
                    print cursor, 'downloaded', str(i), 'tweets ...'
                i += 1
                sent_dict[tweet_list[5]] = sentiment
                lang_list.append(tweet.lang)
                documents.append(strip_non_ascii(tweet_list[5]))
    except:
        # A rate limit or connection error ends the download with whatever was collected
        pass
    f.close()
    imported_table = 'tweets.csv'
    print cursor, 'downloaded', str(i - 1), 'tweets ...'
    print cursor, 'done!'
# Enable all fields to view
pd.set_option('display.max_columns', None)
# Read CSV to dataframe
df = pd.read_csv(open(imported_table, 'rU'), index_col=False, encoding='utf-8', engine='c')
if toggle.value == 'Import Table':
    print cursor, 'Parsing imported table'
    for i in range(len(df)):  # range(0, len(df)-1) would skip the last row
        tweetval = df['tweet'].iloc[i]
        sentval = df['sentiment'].iloc[i]
        langval = df['language'].iloc[i]
        if sentval >= .25:
            positive += 1
            positive_list += str(tweetval)
        elif sentval <= -.25:
            negative += 1
            negative_list += str(tweetval)
        else:
            neutral += 1
            neutral_list += str(tweetval)
        all_text += str(tweetval)
        sent_dict[tweetval] = sentval
        lang_list.append(langval)
        documents.append(strip_non_ascii(str(tweetval)))
    print cursor, 'done!'
print '\n', cursor, os.path.abspath(imported_table)
df
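Before plotting, a one-line sanity check on the scores can help; this assumes the sentiment column came in as strings, hence the cast (harmless if pandas already parsed it as numeric):

print df['sentiment'].astype(float).describe()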
In [ ]:
plt.rcParams["figure.figsize"] = [25, 15]
plt.rcParams.update({'font.size': 22})

# Data to plot
labels = 'positive', 'negative', 'neutral'
sizes = [positive, negative, neutral]
colors = ['gold', 'yellowgreen', 'lightcoral']
if positive >= negative:
    explode = (0.1, 0, 0)  # explode the 'positive' slice
else:
    explode = (0, 0.1, 0)  # explode the 'negative' slice
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.show()

a = 1
print 'TOP SENTIMENT TWEETS\n'
# Top three by compound score (slicing [1:4] would skip the highest-scoring tweet)
for item in sorted(sent_dict, key=sent_dict.get, reverse=True)[:3]:
    print str(a), '-', item
    a += 1
a = 1
print '\n\nBOTTOM SENTIMENT TWEETS\n'
for item in sorted(sent_dict, key=sent_dict.get, reverse=False)[:3]:
    print str(a), '-', item
    a += 1
In [ ]:
print cursor, 'browse for English word list'
f = open(browse_file(), 'r')
english = f.read()
f.close()
print cursor, 'success!'

print '\n', cursor, 'parsing each word'
# A set makes the 'word in e_words' membership tests below fast
e_words = set(english.splitlines())
print cursor, 'success!'

def filter_words(document, e_words):
    # Keep only words of 4+ letters that appear in the word list
    words = [word for word in document.split() if word in e_words and len(word) > 3]
    return ' '.join(words)
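# For example (hypothetical input): filter_words('RT great news via http t co xyz', e_words)
# would return 'great news' if only those two tokens appear in the word list.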
print '\n', cursor, 'creating word clouds'
plt.rcParams["figure.figsize"] = [25, 15]
plt.rcParams.update({'font.size': 22})

print 'ALL TWEETS'
wordcloud = WordCloud().generate(filter_words(all_text, e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print 'POSITIVE TWEETS'
wordcloud = WordCloud().generate(filter_words(positive_list, e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print 'NEGATIVE TWEETS'
wordcloud = WordCloud().generate(filter_words(negative_list, e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
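One caveat from the wordcloud library itself: WordCloud.generate() raises a ValueError when the filtered text contains no words, so if one sentiment bucket is empty (e.g., no negative tweets survived filtering), wrap that call in a try/except or skip it.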
In [ ]:
def topicmodel(documents, e_words):
    texts = [[word for word in document.lower().split() if (word in e_words and len(word) > 3)]
             for document in documents]
    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Weight the bag-of-words counts by TF-IDF, then fit a 5-topic LSI model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
    topiclist = lsi.print_topics(5)
    # Each topic string carries signed term weights; keep just the quoted terms
    x = 1
    for item in topiclist:
        print '\n', str(x),
        for topic in re.findall('"([^"]*)"', item[1]):
            print topic,
        x += 1

# Split the collected tweets back into sentiment buckets
all_ = []
pos = []
neg = []
neu = []
for k, v in sent_dict.iteritems():
    all_.append(str(k))
    if v >= .25:
        pos.append(str(k))
    elif v <= -.25:
        neg.append(str(k))
    else:
        neu.append(str(k))

print '\nTopics for All Tweets'
topicmodel(all_, e_words)
print '\n\nTopics for Positive Tweets'
topicmodel(pos, e_words)
print '\n\nTopics for Negative Tweets'
topicmodel(neg, e_words)
print '\n\nTopics for Neutral Tweets'
topicmodel(neu, e_words)
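A note on reading the output: gensim's LsiModel topics are weighted, signed combinations of terms rather than probability distributions, so the printed word lists are best read as loose themes. If you would rather see probabilistic topics, gensim's LDA model is a drop-in alternative at this step; a minimal sketch, assuming the same corpus and dictionary built inside topicmodel() (untested here):

# Hypothetical variant: LDA over the raw bag-of-words corpus instead of TF-IDF + LSI
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, passes=10)
for topic_id, terms in lda.print_topics(5):
    print topic_id, terms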
In [ ]: