TWITTER

Mining & Analyzing Social Media Content: Using Tweets and Posts to Gauge Consumer Sentiment (workshop)

This Jupyter Notebook guides researchers through mining and visualizing Twitter content: keyword searching across a sample of the past 7 days of Twitter posts, with the results downloaded to a structured CSV table.

You will need a Twitter account to register for Twitter consumer and access keys. Register here: https://apps.twitter.com/
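
Once you have the four keys, you can sanity-check them before starting the workshop. A minimal sketch, assuming a tweepy 3.x install (the quoted strings are placeholders for your own keys; in tweepy 3.x, verify_credentials returns your user profile on success and False if the keys are rejected):

import tweepy

# placeholder strings -- substitute the keys from apps.twitter.com
auth = tweepy.OAuthHandler('YOUR_CONSUMER_KEY', 'YOUR_CONSUMER_SECRET')
auth.set_access_token('YOUR_ACCESS_TOKEN', 'YOUR_ACCESS_SECRET')
api = tweepy.API(auth)

if api.verify_credentials():
    print('keys look good')
else:
    print('check your keys')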

STEP 1

PROCESS :: Import Libraries

OUTPUT :: Toggle Button


In [ ]:
cursor='  >> '

### fix for Python 2.7
import codecs
try:
    import vaderSentiment
except:
    print cursor, 'vaderSentiment not installed'
    print cursor, cursor, '...installing'
    !pip install vaderSentiment
    import vaderSentiment
    print cursor, cursor, '...success\n'
# patch vaderSentiment so its lexicon file is read as utf-8 under Python 2
vs_path = vaderSentiment.__file__.replace('__init__.pyc', 'vaderSentiment.py')
vs_path = vs_path.replace('__init__.py', 'vaderSentiment.py')
with codecs.open(vs_path, 'r', encoding='utf8') as original:
    data = original.read()
if 'from io import open' not in data:
    with codecs.open(vs_path, 'w', encoding='utf8') as modified:
        modified.write('# This Python file uses the following encoding: utf-8\nfrom io import open\n' + data)
###
        
print cursor, 'Importing Libraries'
try:
    import tweepy
except:
    print cursor, 'tweepy not installed'
    print cursor, cursor, '...installing'
    !pip install tweepy
    import tweepy
    print cursor, cursor, '...success\n'
import re, string, warnings, os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import ipywidgets as widgets
from IPython.display import clear_output
from Tkinter import *
from tkFileDialog import askopenfilename
import matplotlib.pyplot as plt
from os import path
try:
    from wordcloud import WordCloud
except:
    print cursor, 'wordcloud not installed'
    print cursor, cursor, '...installing'
    !pip install wordcloud
    from wordcloud import WordCloud
    print cursor, cursor, '...success\n'
import pandas as pd
try:
    warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
    from gensim import corpora, models
except:
    print cursor, 'gensim not installed'
    print cursor, cursor, '...installing'
    !pip install gensim
    warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
    from gensim import corpora, models
    print cursor, cursor, '...success\n'
print cursor, 'Success!'

toggle = widgets.ToggleButtons(
    options=['Query API', 'Import Table'],
    description='Tweets:',
    disabled=False
)
toggle
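
Select Query API to download fresh tweets, or Import Table to reuse a CSV saved from an earlier run; Step 2 branches on this choice.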

STEP 2

INPUT :: Consumer Key, Consumer Secret, Access Token, Access Secret

OR

INPUT :: Browse for Saved Table

PROCESS :: Test Twitter API Keys or Imported Table Structure

OUTPUT :: Verify Success of Processes


In [ ]:
# Paste your keys between the quotes (leave blank if importing a saved table)
ckey = ''
csecret = ''
atoken = ''
asecret = ''

# function to browse for a file and return its path
def browse_file():
    root = Tk()
    filename = askopenfilename()
    root.update()
#     root.destroy()     # Macs are unable to recreate after destroy - use withdraw()
    root.withdraw()
    return filename

if toggle.value == 'Query API':
    s_term = widgets.Text()
    print '\n', cursor, 'Testing API Connection'
    OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret, 'access_token_key':atoken, 'access_token_secret':asecret}
    auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
    auth.set_access_token(OAUTH_KEYS['access_token_key'], OAUTH_KEYS['access_token_secret'])
    api = tweepy.API(auth)

    try:
        user = api.get_user('baylor_ds')
        print cursor, 'Success!'
        print '\n', cursor, 'Enter search below:'
        print cursor, '(double-quotes around phrases)'
        print cursor, '(# in front of a hashtag search)'
    except:
        print cursor, 'Something has gone amiss. Check the consumer and access keys'

else:
    try:
        s_term.close()
    except:
        pass
    print cursor, 'Browse for table'
    imported_table = browse_file()
    if imported_table != '':
        s_term = cursor+' Success!'
    else:
        s_term = cursor+' Run code block to try again'

s_term

STEP 3

PROCESS :: Obtain Twitter Content, Calculate Sentiment

OUTPUT :: Data Table


In [ ]:
def strip_non_ascii(cleanme):
    # replace punctuation with spaces, drop newlines, and keep only printable
    # ASCII so each tweet fits cleanly into a single CSV field
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = regex.sub(' ', cleanme)
    stripped = stripped.replace('\n','')
    stripped = (c for c in stripped if 0 < ord(c) < 127)
    return ''.join(stripped)

def sent(line):
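    # VADER's 'compound' score runs from -1 (most negative) to +1 (most
    # positive); later steps treat >= .25 as positive and <= -.25 as negative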
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(line)
    return float(vs['compound'])

def check_words(text):
    # requires NLTK's stopword corpus (run nltk.download('stopwords') once)
    from nltk.corpus import stopwords
    return ' '.join([word for word in text.lower().split()
                     if word not in stopwords.words('english') and 'http' not in word
                     and len(word) >= 4 and not word.isdigit()
                     and word not in search and search not in word and 'amp' not in word])


i = 1
positive = 0
negative = 0
neutral = 0
all_text=''
sent_dict={}
lang_list = []
positive_list = ''
negative_list = ''
neutral_list = ''
documents = []


if toggle.value == 'Query API':
    search = s_term.value
    limit = 0    # Leave 0 for all possible results
    only_english = True    # set False to keep tweets in every language

    f=open('tweets.csv','w')
    f.write('author,screen_name,date-time,hashtags,expanded-url,tweet,Retweeted,Likes,location,timezone,coordinates,sentiment,language\n')

    print cursor, 'Processing Twitter API Query'
    try:
        for tweet in tweepy.Cursor(api.search, q=(search), include_entities=True, tweet_mode='extended').items(limit):

            tweet_list = []

            # keep the tweet if it is English, or if all languages are allowed
            if not only_english or tweet.lang == 'en':
                tweet_list.append(strip_non_ascii(tweet.author.name.encode('utf8')))
                tweet_list.append(strip_non_ascii(tweet.author.screen_name.encode('utf8')))
                tweet_list.append(str(tweet.created_at))
                # pull the first expanded URL from the tweet's entities, if any
                urls = tweet.entities.get('urls', [])
                url = urls[0].get('expanded_url', '') if urls else ''
                # join all hashtags into one semicolon-delimited CSV field
                hashtags = ''
                for item in tweet.entities.get('hashtags'):
                    hashtags = hashtags+item.get('text')+';'
                tweet_list.append(strip_non_ascii(hashtags[:-1]))
                if not 'twitter.com' in url and not ',' in url:
                    tweet_list.append(url.replace('\'',''))
                else:
                    tweet_list.append('')
                if 'retweeted_status' in str(tweet):
                    tweet_list.append(strip_non_ascii(tweet.retweeted_status.full_text.encode('utf8')))
                    rt = 'TRUE'
                else:
                    tweet_list.append(strip_non_ascii(tweet.full_text.encode('utf8')))
                    rt = 'FALSE'
                tweet_list.append(rt)
                tweet_list.append(str(tweet.favorite_count))
                tweet_list.append(strip_non_ascii(str(tweet.user.location.encode('utf8'))))
                tweet_list.append(strip_non_ascii(str(tweet.user.time_zone)))
                if 'type' in str(tweet.geo):
                    tweet_list.append(str(tweet.geo).replace(',',''))
                else:
                    tweet_list.append('Not Provided')
                sentiment = sent(tweet_list[5])    # index 5 holds the tweet text
                if sentiment >= .25:
                    positive+=1
                    positive_list+=tweet_list[5]
                elif sentiment <= -.25:
                    negative+=1
                    negative_list+=tweet_list[5]
                else:
                    neutral+=1    
                    neutral_list+=tweet_list[5]
                tweet_list.append(str(sentiment))
                all_text+=tweet_list[5]
                tweet_list.append(tweet.lang)
                # one comma-separated row per tweet (a trailing comma would
                # add a phantom empty column when pandas reads the CSV back)
                f.write(','.join(tweet_list) + '\n')
                if i%25 == 0 or i == 1:
                    print cursor, 'downloaded', str(i), 'tweets ...'
                    clear_output(wait=True)
                    print cursor, 'Processing Twitter API Query'
                    print cursor, 'downloaded', str(i), 'tweets ...'
                i+=1
                sent_dict[tweet_list[5]] = sentiment
                lang_list.append(tweet.lang)
                documents.append(strip_non_ascii(tweet_list[5]))
    except:
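        # an API error here (commonly the rate limit) simply ends the download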
        pass

    f.close()
    imported_table = 'tweets.csv'
    print cursor, 'downloaded', str(i-1), 'tweets ...'
    print cursor, 'done!'

# Enable all fields to view
pd.set_option('display.max_columns', None)

# Read CSV to dataframe
df=pd.read_csv(open(imported_table,'rU'), index_col=False, encoding='utf-8', engine='c')

if toggle.value == 'Import Table':
    print cursor, 'Parsing imported table'
    for i in range(len(df)):
        tweetval = df['tweet'].iloc[i]
        sentval=df['sentiment'].iloc[i]
        langval=df['language'].iloc[i]
        if sentval >= .25:
            positive+=1
            positive_list+=str(tweetval)
        elif sentval <= -.25:
            negative+=1
            negative_list+=str(tweetval)
        else:
            neutral+=1
            neutral_list+=str(tweetval)
        all_text+=str(tweetval)
        sent_dict[tweetval]=sentval
        lang_list.append(langval)
        documents.append(strip_non_ascii(str(tweetval)))
    print cursor, 'done!'
    
print '\n', cursor, os.path.abspath(imported_table)
df

STEP 4

PROCESS :: Calculate Sentiment % and Top/Bottom Tweets

OUTPUT :: Pie Chart, Top/Bottom Tweets


In [ ]:
plt.rcParams["figure.figsize"] = [25,15]
plt.rcParams.update({'font.size': 22})

# Data to plot
labels = 'positive', 'negative', 'neutral'
sizes = [positive, negative, neutral]
colors = ['gold', 'yellowgreen', 'lightcoral']
if positive >= negative:
    explode = (0.1, 0, 0)  # explode the 'positive' slice
else:
    explode = (0.0, 0.1, 0)  # explode the 'negative' slice

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
 
plt.axis('equal')
plt.show()

a=1
print 'TOP SENTIMENT TWEETS\n'
newA = sorted(sent_dict, key=sent_dict.get, reverse=True)[:3]
for item in newA:
    print str(a), '-', item
    a+=1

a=1
print '\n\nBOTTOM SENTIMENT TWEETS\n'
newA = sorted(sent_dict, key=sent_dict.get, reverse=False)[:3]
for item in newA:
    print str(a), '-', item
    a+=1

STEP 5

INPUT :: Browse for English Word List

PROCESS :: Filter Words

OUTPUT :: Word Clouds
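
NOTES :: The word list should be a plain text file with one English word per line; Steps 5 and 6 use it to keep only recognizable English words in the clouds and topics.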


In [ ]:

print cursor, 'browse for English word list'
f=open(browse_file(),'r')
english=f.read()
f.close()
print cursor, 'success!'

print '\n', cursor, 'parsing each word'
# a set makes the word-membership tests below much faster than a list
e_words = set(english.splitlines())
print cursor, 'success!'
    
def filter_words(document,e_words):
    words = [word for word in document.split() if word in e_words and len(word)>3]
    return ' '.join(words)

print '\n', cursor, 'creating word cloud #1'
plt.rcParams["figure.figsize"] = [25,15]
plt.rcParams.update({'font.size': 22})
print 'ALL TWEETS'
wordcloud = WordCloud().generate(filter_words(all_text,e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print 'POSITIVE TWEETS'
wordcloud = WordCloud().generate(filter_words(positive_list,e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

print 'NEGATIVE TWEETS'
wordcloud = WordCloud().generate(filter_words(negative_list,e_words))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

STEP 6

PROCESS :: Generate LSI Topics

OUTPUT :: Topics per Sentiment

NOTES :: Latent Semantic Indexing - https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
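
For intuition, here is a minimal self-contained sketch of the same gensim pipeline Step 6 runs, applied to three toy documents invented for illustration:

from gensim import corpora, models

# three toy "tweets", invented for illustration
docs = [['coffee', 'shop', 'great', 'coffee'],
        ['terrible', 'coffee', 'service'],
        ['great', 'service', 'friendly', 'staff']]

dictionary = corpora.Dictionary(docs)            # word -> integer id
corpus = [dictionary.doc2bow(d) for d in docs]   # bag-of-words vectors
tfidf = models.TfidfModel(corpus)                # reweight terms by TF-IDF
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

for topic in lsi.print_topics(2):                # (topic_no, weighted word mix)
    print(topic)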


In [ ]:
def topicmodel(documents, e_words):
    # keep only English dictionary words longer than 3 characters
    texts = [[word for word in document.lower().split()
              if word in e_words and len(word) > 3]
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    # map each word to an integer id
    dictionary = corpora.Dictionary(texts)
    # dictionary.save('/tmp/twitter.dict')

    # bag-of-words vector for each tweet
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('/tmp/twitter.mm', corpus)

    # reweight terms by TF-IDF, then project onto 5 latent topics
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
    corpus_lsi = lsi[corpus_tfidf]

    topiclist = lsi.print_topics(5)

    x=1
    for item in topiclist:
        print '\n', str(x),
        for topic in re.findall('"([^"]*)"', item[1]):
            print topic,
        x+=1



all_=[]
pos=[]
neg=[]
neu=[]

# bin every tweet by its sentiment score
for k, v in sent_dict.iteritems():
    all_.append(str(k))
    if v >= .25:
        pos.append(str(k))
    elif v <= -.25:
        neg.append(str(k))
    else:
        neu.append(str(k))

# run the topic model once per sentiment group
for label, docs in [('All', all_), ('Positive', pos), ('Negative', neg), ('Neutral', neu)]:
    print '\n\nTopics for', label, 'Tweets'
    topicmodel(docs, e_words)
