Preprocessing tweets


In [ ]:
%reset

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from pprint import pprint

Recursive loading of tweets from a directory or a single file. Each file contains an arbitrary number of tweets, one JSON object per line.


In [ ]:
import json
import os
#from os import listdir
import pandas as pd
#import time

def loadTweets(file_path, lang):
    # Read a file with one JSON-encoded tweet per line, keeping unique tweets in the requested language
    tweets, ids = [], []

    with open(file_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                # Parse one line of the file into a JSON object
                tweet = json.loads(line)
                if tweet['id'] not in ids and tweet['lang'] == lang:
                    tweets.append(tweet)
                    ids.append(tweet['id'])
            except (ValueError, KeyError):
                # Skip lines that are not valid JSON or lack the expected fields
                continue

    return tweets

def createTweetsDataframe(path, lang):
    tweets, text, ids = [],[],[]
    #created_at, user_mentions, hashtags = [],[],[]
    
    if os.path.isfile(path):
        tweets = loadTweets(path, lang)
        for tweet in tweets:
            text.append(tweet['text'])
    else:
        for root, subdirs, files in os.walk(path):
            for file in files:
                #print(os.path.join(root, file))
                tweets = loadTweets(os.path.join(root, file), lang)
                for tweet in tweets:
                    if tweet['id'] not in ids:
                        #created_at.append(time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
                        #user_mentions.append([user_mention['screen_name'] for user_mention in tweet['entities']['user_mentions']])
                        #hashtags.append([hashtag['text'] for hashtag in tweet['entities']['hashtags']])
                        text.append(tweet['text'])
                        ids.append(tweet['id'])
        
    #df = pd.DataFrame(data=[created_at, user_mentions, hashtags, text]).T
    df = pd.DataFrame(data=[text]).T
    #df.columns = ['created_at', 'user_mentions', 'hashtags', 'text']
    df.columns = ['text']

    #joiner = lambda x: ' '.join(x)
    #df['user_mentions'] = df['user_mentions'].apply(joiner)
    #df['hashtags'] = df['hashtags'].apply(joiner)
    
    return df
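
Each line of the input files is expected to be one standalone JSON-encoded tweet. A minimal illustration of the format, trimmed to the fields loadTweets actually reads (the values below are made up):


In [ ]:
# Hypothetical example of a single input line, reduced to the fields used by loadTweets
example_line = '{"id": 123, "lang": "es", "text": "Ejemplo de tweet sobre el #26J"}'
example_tweet = json.loads(example_line)
print(example_tweet['id'], example_tweet['lang'], example_tweet['text'])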

Removal of stopwords for Spanish (Castilian) and stemming to reduce each word to its root.


In [ ]:
import nltk
print(nltk.__version__)

# Download the NLTK resources used below (the Spanish stopword list);
# calling nltk.download() with no arguments opens the interactive downloader instead
nltk.download('stopwords')

In [ ]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from string import punctuation

def stopwordsES(add_stopwords):
    # Spanish stopwords plus punctuation, inverted marks and any extra tokens supplied by the caller
    non_words = list(punctuation)
    non_words.extend([u'¡', u'¿'])
    non_words.extend(add_stopwords)
    stopwords_es = stopwords.words('spanish')
    stopwords_es.extend(non_words)
    return stopwords_es

def removeStopwords(text, stopwords):
    # Tokenize the lowercased text, then drop stopwords and URLs
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text.lower())
    lower_stopwords = [stopword.lower() for stopword in stopwords]
    text_stopwords_es = [token for token in tokens if token not in lower_stopwords and not token.startswith('http')]
    return ' '.join(text_stopwords_es)

In [ ]:
from nltk.stem import SnowballStemmer

def snowballStemmerES():
    stemmer = SnowballStemmer('spanish')
    return stemmer

def stemming(text, stemmer):
    # Stem every token except user mentions and hashtags, which are kept verbatim
    stemmed = []
    for token in text.split():
        if token.startswith('@') or token.startswith('#'):
            stemmed.append(token)
        else:
            stemmed.append(stemmer.stem(token))
    return ' '.join(stemmed)
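
A quick sanity check of the two helpers on a made-up tweet (not taken from the collected data); the exact stems depend on NLTK's Spanish Snowball rules, so the behaviour noted in the comments is only indicative:


In [ ]:
# Illustrative tweet, invented for this example
sample = 'RT @PPopular: ¡Gracias a todos por la jornada del #26J! https://t.co/enlace'
sample_stopwords = stopwordsES([u'RT', u'…', u'...', u'#26J'])

clean = removeStopwords(sample, sample_stopwords)
print(clean)                                  # mentions kept; URL, stopwords and punctuation dropped
print(stemming(clean, snowballStemmerES()))   # everything stemmed except @mentions and #hashtags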

Function that calls the previous ones and can optionally run in parallel, splitting the work across the available processors.


In [ ]:
from joblib import Parallel, delayed
import multiprocessing

def preprocessTweetsDf(path, lang, stopwords, stemmer, intermediate_info, parallelize):
    df = createTweetsDataframe(path, lang)
    processors = multiprocessing.cpu_count()

    if not parallelize:
        df['text_stopwords'] = [removeStopwords(text, stopwords) for text in df['text']]
    else:
        df['text_stopwords'] = Parallel(n_jobs=processors)(delayed(removeStopwords)(text, stopwords) for text in df['text'])

    #if not intermediate_info:
        #del df['text']

    if not parallelize:
        df['text_stemming'] = [stemming(text, stemmer) for text in df['text_stopwords']]
    else:
        df['text_stemming'] = Parallel(n_jobs=processors)(delayed(stemming)(text, stemmer) for text in df['text_stopwords'])

    if not intermediate_info:
        del df['text_stopwords']

    return df

print('Number of processors:', multiprocessing.cpu_count())

The tweets were collected on the Spanish general election day of June 26, 2016, using the hashtags #26J, #EleccionesGenerales2016 and #L6elecciones as collection filters. Since the vast majority of tweets therefore contain them, they add no distinction between texts and are added to the stopwords to be filtered out. Likewise, the token "RT" and two encodings of the ellipsis are added as stopwords because of their irrelevance.


In [ ]:
path = '/home/user/Escritorio/2016/06/26/23'
add_stopwords = [u'RT', u'…', u'...', u'#26J', u'#EleccionesGenerales2016', u'#L6elecciones']

df = preprocessTweetsDf(path=path, lang='es', stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES(), intermediate_info=False, parallelize=True)
print(df.shape)
df.head()

Topic modelling

Corpora and Vector Spaces

Creation of the dictionary that maps tokens to numeric ids, returning the bag-of-words corpus built from the frequency of each token per tweet.


In [ ]:
from gensim import corpora

def createDictionaryCorpusBOW(dataframe):
    # Map each stemmed token to an integer id and represent each tweet as (token_id, frequency) pairs
    texts = [text.split() for text in dataframe['text_stemming']]
    dictionary = corpora.Dictionary(texts)
    corpus_bow = [dictionary.doc2bow(text) for text in texts]
    return (dictionary, corpus_bow)

dictionary, corpus_bow = createDictionaryCorpusBOW(df)
#print(dictionary.token2id)
#pprint(corpus_bow)
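
As a quick illustration of the mapping, each document in corpus_bow is a list of (token_id, frequency) pairs, and the dictionary translates the ids back to stemmed tokens; the actual ids and tokens depend on the collected tweets:


In [ ]:
# Inspect the bag-of-words representation of the first tweet
print(corpus_bow[0])
print([(dictionary[token_id], freq) for token_id, freq in corpus_bow[0]])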

Topics and Transformations


In [ ]:
from gensim import models

TF-IDF, Term Frequency - Inverse Document Frequency


In [ ]:
def tfidfModelCorpus(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return (tfidf, corpus_tfidf)

In [ ]:
tfidf, corpus_tfidf = tfidfModelCorpus(corpus_bow)
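
To see what the transformation does, the TF-IDF weights of a single tweet can be compared with its raw bag-of-words counts; tokens that appear in many tweets get their weight pushed down (the output depends on the collected data):


In [ ]:
# Raw counts vs. TF-IDF weights for the first tweet
print(corpus_bow[0])
print(tfidf[corpus_bow[0]])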

LSI, Latent Semantic Indexing

The LSI model can also be built from a corpus obtained by applying Term Frequency - Inverse Document Frequency (TF-IDF) to the bag-of-words corpus, which is computed only from per-tweet token frequencies. To derive the new corpus, TF-IDF takes into account not only the frequency of each token but also the number of documents in which that token appears. For the experiments in this project, the bag-of-words corpus is used so that both models start from the same corpus.


In [ ]:
def lsiModelCorpus(corpus, num_topics, dictionary):
    lsi = models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    corpus_lsi = lsi[corpus]
    return (lsi, corpus_lsi)

In [ ]:
lsi, corpus_lsi = lsiModelCorpus(corpus=corpus_bow, num_topics=2, dictionary=dictionary)
#lsi, corpus_lsi = lsiModelCorpus(corpus=corpus_tfidf, num_topics=2, dictionary=dictionary)
pprint(lsi.print_topics())

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

def plot2dimensionsLSI(corpus):
    # Each document becomes a point whose coordinates are its weights on the two LSI topics
    X = np.zeros((len(corpus), 2))
    for i, doc in enumerate(corpus):
        for topic_id, value in doc:
            X[i][topic_id] = value
    print(X.shape)

    plt.title('Documents in the LSI space')
    plt.xlabel('Dimension / Topic 1')
    plt.ylabel('Dimension / Topic 2')
    plt.scatter(X[:,0], X[:,1])
    
plot2dimensionsLSI(corpus_lsi)

LDA, Latent Dirichlet Allocation


In [ ]:
def ldaModelCorpus(corpus, num_topics, dictionary):
    lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    corpus_lda = lda[corpus]
    return (lda, corpus_lda)

In [ ]:
import pyLDAvis
from pyLDAvis import gensim as gensimvis

In [ ]:
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=2, dictionary=dictionary)
pprint(lda.print_topics())

pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
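
If the notebook is run headless or the visualization needs to be shared, pyLDAvis can also write it to a standalone HTML file; a minimal sketch (the file name is arbitrary):


In [ ]:
# Optional: save the interactive visualization as a self-contained HTML page
vis = gensimvis.prepare(lda, corpus_bow, dictionary)
pyLDAvis.save_html(vis, 'lda_topics.html')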

Number of Topics with Latent Dirichlet Allocation


In [ ]:
num_topics = 3
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))

pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)

In [ ]:
num_topics = 4
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))

pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)

In [ ]:
num_topics = 5
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))

pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)

In [ ]:
num_topics = 6
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))

pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
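
Besides visually inspecting the pyLDAvis output, gensim's LdaModel exposes a per-word likelihood bound (log_perplexity) that can be compared across candidate topic counts; a minimal sketch, computed on the training corpus itself and therefore only a rough indication (values are negative; closer to zero is better):


In [ ]:
# Compare the per-word likelihood bound for the candidate numbers of topics
for k in [2, 3, 4, 5, 6]:
    m, _ = ldaModelCorpus(corpus=corpus_bow, num_topics=k, dictionary=dictionary)
    print('num_topics =', k, '| per-word bound:', m.log_perplexity(corpus_bow))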

Similarity Queries

Similarity computation for four texts, which are transformed into the LSI and LDA spaces to obtain the most similar tweets returned by each model.


In [ ]:
from gensim import similarities

def similarTexts(text, model, dictionary, corpus_model, num_best, stopwords, stemmer):
    # Preprocess the query the same way as the corpus, then project it into the model space
    vec_bow = dictionary.doc2bow(stemming(removeStopwords(text, stopwords), stemmer).split())
    vec_model = model[vec_bow]

    index = similarities.MatrixSimilarity(corpus_model)
    if num_best is not None and num_best != 0:
        index.num_best = num_best * 5 # Retrieve extra results so duplicates (essentially RTs) can be dropped
    sims = index[vec_model]

    # Consecutive results with the same similarity score are treated as duplicates (retweets)
    last_score = ''
    unique_sims = []
    for sim in sims:
        if sim[1] != last_score:
            unique_sims.append(sim)
            last_score = sim[1]

    return unique_sims[:num_best]

def similarityQueries(texts, list_models, list_num_topics, num_best, dictionary, stopwords, stemmer):
    for text in texts:
        for num_topics in list_num_topics:
            for m in list_models:
                if m == 'LSI':
                    model, corpus = lsiModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
                if m == 'LDA':
                    model, corpus = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)

                sims = similarTexts(text=text, model=model, dictionary=dictionary, corpus_model=corpus, num_best=num_best, stopwords=stopwords, stemmer=stemmer)

                print('\n*****', '#Topics:', num_topics, '| Model:', m, '| Text:', text)
                for sim in sims:
                    print('----------\n', sim[1], '\n', df['text'][sim[0]])
                print('====================')

In [ ]:
texts = ['@marianorajoy, besando a su mujer y saltando en el balcón de Génova tras la victoria del @PPopular',
         'La coalición Unidos Podemos, @ahorapodemos e @iunida, se queda con la suma de los escaños obtenidos en las últimas elecciones',
         'El @PSOE vuelve a fracasar en su segundo intento por gobernar y Pedro Sánchez ve peligrar su liderazgo',
         '@PartidoPACMA saca más votos que Bildu y Coalición Canaria pero se queda sin escaño']

similarityQueries(texts=texts, list_models=['LSI','LDA'], list_num_topics=[4,5], num_best=5, dictionary=dictionary, stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES())

Memory Peaks and Time Elapsed

Measurement of the time each model takes to build, as well as the peak memory consumed.


In [ ]:
%load_ext memory_profiler
import datetime

paths = ['/home/user/Escritorio/2016/06/26/22', '/home/user/Escritorio/2016/06/26']

n_tweets, lsi_times, lda_times = [],[],[]
for path in paths:
    df = preprocessTweetsDf(path, lang='es', stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES(), intermediate_info=False, parallelize=True)
    dictionary, corpus_bow = createDictionaryCorpusBOW(df)
    n_tweets.append(df.shape[0])

    print('\tLatent Semantic Indexing, #tweets =', n_tweets[-1])
    start_time = datetime.datetime.now()
    %memit lsi = models.LsiModel(corpus_bow, num_topics=4, id2word=dictionary)
    lsi_times.append((datetime.datetime.now() - start_time).total_seconds())

    print('\tLatent Dirichlet Allocation, #tweets =', n_tweets[-1])
    start_time = datetime.datetime.now()
    %memit lda = models.ldamodel.LdaModel(corpus_bow, num_topics=4, id2word=dictionary)
    lda_times.append((datetime.datetime.now() - start_time).total_seconds())
    
for i in range(len(n_tweets)):
    print('#tweets:', n_tweets[i], '| LSI time elapsed:', lsi_times[i], 'seconds', '| LDA time elapsed:', lda_times[i], 'seconds')
    
plt.plot(n_tweets, lsi_times, 'b--', label='LSI')
plt.plot(n_tweets, lda_times, 'r--', label='LDA')
plt.xlabel('Number of tweets')
plt.ylabel('Training time (seconds)')
plt.legend()