In [1]:
# some nlp tools
import spacy
import gensim
import pyLDAvis.gensim as gensimvis
import pyLDAvis
In [2]:
# tools to get data from s3
import boto3
from io import StringIO
import json
import requests
In [3]:
# would probably need these
import numpy as np
import pandas as pd
import itertools
In [4]:
nlp = spacy.load('en')
To connect to S3 programmatically (or any other AWS service), the official SDK for Python is boto3. Remember to use a profile that has access rights to that bucket.
In [5]:
boto_session = boto3.session.Session(profile_name='wwymakAdmin')
s3 = boto_session.resource('s3')
def to_StringIO(key, bucket_name, s3=s3):
    obj = s3.Object(bucket_name, key)
    # the S3 body comes back as bytes, so decode it before wrapping in StringIO
    return StringIO(obj.get()['Body'].read().decode('utf-8'))
bkt = 'discursive'
# you can change the profile here to whatever profile works on your server for listing the discursive bucket
s = boto3.session.Session(profile_name='wwymakAdmin')
s3 = s.resource('s3')
bucket = s3.Bucket(bkt)
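The to_StringIO helper above isn't used again in this notebook, but as a quick sketch it could pull one of the tweet dumps straight from S3 instead of going over HTTPS (the key below is just a placeholder, not a real file):
In [ ]:
# placeholder key for illustration -- any object key in the bucket would do
example_key = '2017/1/16/example.json'
# read the object body from S3 and parse the JSON payload
example_tweets = json.load(to_StringIO(example_key, bkt))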
In [24]:
# file format is YYYY/M/DD/HH/filename.json
prefix = '2017/1/16/'
# the base bucket name for the discursive project
S3_BASE_URL = 'https://s3-us-west-2.amazonaws.com/discursive/'
Use spaCy to tokenize the tweets, then construct the dictionary and corpus from them using gensim.
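As a rough illustration of what gensim builds here (the toy tokens below are made up purely for the example): the dictionary maps each token to an integer id, and doc2bow turns a token list into a sparse bag-of-words of (id, count) pairs.
In [ ]:
# toy tokenized "tweets", invented for illustration only
toy_tokens = [['politics', 'vote', 'election'], ['vote', 'twitter']]
toy_dictionary = gensim.corpora.Dictionary(toy_tokens)
# each document becomes a list of (token_id, count) tuples
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_tokens]
print(toy_dictionary.token2id)
print(toy_corpus)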
In [23]:
def parse_tweets(tweets_arr, tweets_dictionary, tweets_corpus, all_tweet_tokens, tweets_dictionary_filepath, tweets_corpus_filepath):
    # remove the retweets
    tweets_text_documents = [x['text'] for x in tweets_arr if not x['text'].lower().startswith('rt')]
    tokenized_tweets = []
    # Process tweets using the spaCy NLP pipeline.
    for doc in nlp.pipe(tweets_text_documents, n_threads=4, batch_size=100):
        ents = doc.ents  # Named entities.
        # Keep only words (no numbers, no punctuation).
        # Lemmatize tokens, remove punctuation and remove stopwords.
        doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        # Remove common words from a stopword list.
        #doc = [token for token in doc if token not in STOPWORDS]
        # Add named entities, but only if they are a compound of more than one word.
        #doc.extend([str(entity) for entity in ents if len(entity) > 1])
        tokenized_tweets.append(doc)
    dictionary = gensim.corpora.Dictionary(tokenized_tweets)
    corpus = [dictionary.doc2bow(x) for x in tokenized_tweets]
    # print(len(corpus))
    all_tweet_tokens.extend(tokenized_tweets)
    # merge_with updates tweets_dictionary in place and returns a transformation
    # that remaps the ids of this batch onto the merged dictionary
    id_transform = tweets_dictionary.merge_with(dictionary)
    # save current dict
    tweets_dictionary.save(tweets_dictionary_filepath)
    tweets_corpus.extend(id_transform[corpus])
    # save current corpus
    gensim.corpora.MmCorpus.serialize(tweets_corpus_filepath, tweets_corpus)
In [25]:
# initialise empty dictionary
tweets_dictionary = gensim.corpora.Dictionary([])
tweets_corpus = []
all_tweets_tokens = []
for obj in bucket.objects.filter(Prefix=prefix):
    filename = obj.key
    r = requests.get(S3_BASE_URL + filename)
    # make sure only to execute if the response is successful
    if r.status_code == 200:
        parse_tweets(r.json(), tweets_dictionary, tweets_corpus, all_tweets_tokens, 'tweets_dictionary.dict', 'tweets_corpus.mm')
        print(filename)
If you already have a corpus and dictionary, you can just load them.
In [13]:
current_corpus = gensim.corpora.MmCorpus('tweets_corpus.mm')
current_dictionary = gensim.corpora.Dictionary.load('tweets_dictionary.dict')
In [27]:
len(tweets_corpus)
Out[27]:
In [28]:
# train gensim LDA model -- no optimisation at the moment, just experimenting
lda_model = gensim.models.LdaModel(tweets_corpus, id2word=tweets_dictionary, num_topics=30)
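To go beyond a quick experiment, here is a hedged sketch of the kind of training parameters worth tuning; the values are arbitrary starting points, not recommendations.
In [ ]:
# illustrative only -- more passes and a learned prior usually give more stable topics
lda_model_tuned = gensim.models.LdaModel(
    tweets_corpus,
    id2word=tweets_dictionary,
    num_topics=30,
    passes=5,          # extra passes over the corpus
    alpha='auto',      # learn an asymmetric document-topic prior
    random_state=42)   # make runs reproducible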
In [29]:
# save model to file
lda_model.save('twitter_lda.model')
In [30]:
#print some topics to see if they make any sense...
lda_model.print_topics(10, 5)
Out[30]:
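Another quick sanity check (a sketch using the corpus built above): look at the topic mixture the model assigns to a single tweet.
In [ ]:
# topic distribution of the first tweet in the corpus,
# returned as a list of (topic_id, probability) pairs
lda_model.get_document_topics(tweets_corpus[0])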
In [31]:
#display the model with pyLDAvis for exploration
twitter_data_for_vis = gensimvis.prepare(lda_model, tweets_corpus, tweets_dictionary)
pyLDAvis.display(twitter_data_for_vis)
Out[31]:
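If you want to share the visualisation outside the notebook, pyLDAvis can also write it out as a standalone HTML page (the filename here is just an example).
In [ ]:
# save the interactive visualisation to a standalone HTML file
pyLDAvis.save_html(twitter_data_for_vis, 'twitter_lda_vis.html')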