In [1]:
# some nlp tools
import spacy
import gensim
import pyLDAvis.gensim as gensimvis
import pyLDAvis
In [2]:
# tools to get data from s3
import boto3
from io import StringIO
import json
import requests
In [3]:
# would probably need these
import numpy as np
import pandas as pd
import itertools
In [4]:
nlp = spacy.load('en')
To connect to S3 programmatically (or any other AWS service), the official SDK for Python is boto3. Remember to use a profile that has access rights to that bucket.
In [5]:
boto_session = boto3.session.Session(profile_name='wwymakAdmin')
s3 = boto_session.resource('s3')
def to_StringIO(key, bucket_name, s3=s3):
    obj = s3.Object(bucket_name, key)
    # the S3 body comes back as bytes, so decode it before wrapping in StringIO
    return StringIO(obj.get()['Body'].read().decode('utf-8'))
bkt = 'discursive'
# you can change the profile here to whatever profile works on your server for listing the discursive bucket
s = boto3.session.Session(profile_name='wwymakAdmin')
s3 = s.resource('s3')
bucket = s3.Bucket(bkt)
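The to_StringIO helper above isn't used again in this notebook, but as a quick sketch it could pull one of the tweet dumps straight from S3 instead of going over HTTPS (the key below is just a placeholder, not a real file):
In [ ]:
# placeholder key for illustration -- any object key in the bucket would do
example_key = '2017/1/16/example.json'
# read the object body from S3 and parse the JSON payload
example_tweets = json.load(to_StringIO(example_key, bkt))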
In [24]:
# file format is YYYY/M/DD/HH/filename.json
prefix = '2017/1/16/'
# the base bucket name for the discursive project
S3_BASE_URL = 'https://s3-us-west-2.amazonaws.com/discursive/'
Use spaCy to tokenize the tweets, then construct the dictionary and corpus from them using gensim.
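As a rough illustration of what gensim builds here (the toy tokens below are made up purely for the example): the dictionary maps each token to an integer id, and doc2bow turns a token list into a sparse bag-of-words of (id, count) pairs.
In [ ]:
# toy tokenized "tweets", invented for illustration only
toy_tokens = [['politics', 'vote', 'election'], ['vote', 'twitter']]
toy_dictionary = gensim.corpora.Dictionary(toy_tokens)
# each document becomes a list of (token_id, count) tuples
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_tokens]
print(toy_dictionary.token2id)
print(toy_corpus)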
In [23]:
def parse_tweets(tweets_arr, tweets_dictionary, tweets_corpus, all_tweet_tokens, tweets_dictionary_filepath, tweets_corpus_filepath):
    # remove the retweets
    tweets_text_documents = [x['text'] for x in tweets_arr if not x['text'].lower().startswith('rt')]
    tokenized_tweets = []
    # Process tweets using the spaCy NLP pipeline.
    for doc in nlp.pipe(tweets_text_documents, n_threads=4, batch_size=100):
        ents = doc.ents  # Named entities.
        # Keep only words (no numbers, no punctuation).
        # Lemmatize tokens, remove punctuation and remove stopwords.
        doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        # Remove common words from a stopword list.
        #doc = [token for token in doc if token not in STOPWORDS]
        # Add named entities, but only if they are a compound of more than one word.
        #doc.extend([str(entity) for entity in ents if len(entity) > 1])
        tokenized_tweets.append(doc)
    dictionary = gensim.corpora.Dictionary(tokenized_tweets)
    corpus = [dictionary.doc2bow(x) for x in tokenized_tweets]
    # print(len(corpus))
    all_tweet_tokens.extend(tokenized_tweets)
    # merge_with updates tweets_dictionary in place and returns a transformation
    # that remaps the ids of this batch onto the merged dictionary
    id_transform = tweets_dictionary.merge_with(dictionary)
    # save current dict
    tweets_dictionary.save(tweets_dictionary_filepath)
    tweets_corpus.extend(id_transform[corpus])
    # save current corpus
    gensim.corpora.MmCorpus.serialize(tweets_corpus_filepath, tweets_corpus)
In [25]:
# initialise empty dictionary
tweets_dictionary = gensim.corpora.Dictionary([])
tweets_corpus = []
all_tweets_tokens = []
for obj in bucket.objects.filter(Prefix=prefix):
    filename = obj.key
    r = requests.get(S3_BASE_URL + filename)
    # make sure only to execute if the response is successful
    if r.status_code == 200:
        parse_tweets(r.json(), tweets_dictionary, tweets_corpus, all_tweets_tokens, 'tweets_dictionary.dict', 'tweets_corpus.mm')
        print(filename)
If you already have a corpus and dictionary, you can just load them.
In [13]:
current_corpus = gensim.corpora.MmCorpus('tweets_corpus.mm')
current_dictionary = gensim.corpora.Dictionary.load('tweets_dictionary.dict')
In [27]:
len(tweets_corpus)
Out[27]:
In [28]:
# train gensim LDA model -- no optimisation at the moment, just experimenting
lda_model = gensim.models.LdaModel(tweets_corpus, id2word=tweets_dictionary, num_topics=30)
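To go beyond a quick experiment, here is a hedged sketch of the kind of training parameters worth tuning; the values are arbitrary starting points, not recommendations.
In [ ]:
# illustrative only -- more passes and a learned prior usually give more stable topics
lda_model_tuned = gensim.models.LdaModel(
    tweets_corpus,
    id2word=tweets_dictionary,
    num_topics=30,
    passes=5,          # extra passes over the corpus
    alpha='auto',      # learn an asymmetric document-topic prior
    random_state=42)   # make runs reproducible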
In [29]:
# save model to file
lda_model.save('twitter_lda.model')
In [30]:
#print some topics to see if they make any sense...
lda_model.print_topics(10, 5)
Out[30]:
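Another quick sanity check (a sketch using the corpus built above): look at the topic mixture the model assigns to a single tweet.
In [ ]:
# topic distribution of the first tweet in the corpus,
# returned as a list of (topic_id, probability) pairs
lda_model.get_document_topics(tweets_corpus[0])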
In [31]:
#display the model with pyLDAvis for exploration
twitter_data_for_vis = gensimvis.prepare(lda_model, tweets_corpus, tweets_dictionary)
pyLDAvis.display(twitter_data_for_vis)
Out[31]:
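If you want to share the visualisation outside the notebook, pyLDAvis can also write it out as a standalone HTML page (the filename here is just an example).
In [ ]:
# save the interactive visualisation to a standalone HTML file
pyLDAvis.save_html(twitter_data_for_vis, 'twitter_lda_vis.html')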