In [ ]:
%reset
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint
Recursive loading of tweets from a directory or a single file. Each file contains an arbitrary number of tweets, one JSON object per line.
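Before the loader itself, a brief sketch of the expected line format. The sample tweet below is invented and only illustrative; the loading code that follows relies solely on the 'id', 'lang' and 'text' fields.
In [ ]:
import json

# Minimal sketch of one line of a JSON-lines tweet file (the sample values are invented)
sample_line = '{"id": 1, "lang": "es", "text": "Ejemplo de tweet", "created_at": "Sun Jun 26 21:00:00 +0000 2016"}'
tweet = json.loads(sample_line)
print(tweet['id'], tweet['lang'], tweet['text'])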
In [ ]:
import json
import os
#from os import listdir
import pandas as pd
#import time
def loadTweets(file_path, lang):
    """Load the tweets of one JSON-lines file, keeping only unique ids in the requested language."""
    tweets, ids = [], []
    with open(file_path, "r") as tweets_file:
        for line in tweets_file:
            try:
                # Each line of the file is one tweet encoded as a JSON object
                tweet = json.loads(line)
                if tweet['id'] not in ids and tweet['lang'] == lang:
                    tweets.append(tweet)
                    ids.append(tweet['id'])
            except (ValueError, KeyError):
                # Skip lines that are not valid JSON or lack the expected fields
                continue
    return tweets
def createTweetsDataframe(path, lang):
    tweets, text, ids = [], [], []
    #created_at, user_mentions, hashtags = [],[],[]
    if os.path.isfile(path):
        tweets = loadTweets(path, lang)
        for tweet in tweets:
            text.append(tweet['text'])
    else:
        # Walk the directory tree and load every file found
        for root, subdirs, files in os.walk(path):
            for file in files:
                #print(os.path.join(root, file))
                tweets = loadTweets(os.path.join(root, file), lang)
                for tweet in tweets:
                    if tweet['id'] not in ids:
                        #created_at.append(time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')))
                        #user_mentions.append([user_mention['screen_name'] for user_mention in tweet['entities']['user_mentions']])
                        #hashtags.append([hashtag['text'] for hashtag in tweet['entities']['hashtags']])
                        text.append(tweet['text'])
                        ids.append(tweet['id'])
    #df = pd.DataFrame(data=[created_at, user_mentions, hashtags, text]).T
    df = pd.DataFrame(data=[text]).T
    #df.columns = ['created_at', 'user_mentions', 'hashtags', 'text']
    df.columns = ['text']
    #joiner = lambda x: ' '.join(x)
    #df['user_mentions'] = df['user_mentions'].apply(joiner)
    #df['hashtags'] = df['hashtags'].apply(joiner)
    return df
Removal of Spanish (Castilian) stopwords and reduction of each word to its root via stemming.
In [ ]:
import nltk
print(nltk.__version__)
# Download the NLTK data needed below (only the stopword corpus is required here)
nltk.download('stopwords')
In [ ]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from string import punctuation
def stopwordsES(add_stopwords):
    # Spanish stopword list extended with punctuation and any extra tokens to filter out
    non_words = list(punctuation)
    non_words.extend([u'¡', u'¿'])
    non_words.extend(add_stopwords)
    stopwords_es = stopwords.words('spanish')
    stopwords_es.extend(non_words)
    return stopwords_es

def removeStopwords(text, stopwords):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text.lower())
    lower_stopwords = [stopword.lower() for stopword in stopwords]
    text_stopwords_es = [token for token in tokens if token not in lower_stopwords and not token.startswith('http')]
    return ' '.join(text_stopwords_es)
In [ ]:
from nltk.stem import SnowballStemmer
def snowballStemmerES():
    stemmer = SnowballStemmer('spanish')
    return stemmer

def stemming(text, stemmer):
    stemmed = []
    for token in text.split():
        # Keep mentions and hashtags intact; stem every other token
        if token.startswith('@') or token.startswith('#'):
            stemmed.append(token)
        else:
            stemmed.append(stemmer.stem(token))
    return ' '.join(stemmed)
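A quick usage sketch of the two preprocessing steps on an invented tweet (the sample text and the short stopword list are only for illustration):
In [ ]:
# Minimal usage sketch; the sample tweet is invented
sample = 'RT @usuario: Los resultados de las elecciones ya están aquí... #26J https://t.co/xyz'
sw = stopwordsES([u'RT', u'…', u'...', u'#26J'])
clean = removeStopwords(sample, sw)
print(clean)
print(stemming(clean, snowballStemmerES()))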
Function that calls the previous ones and can, if desired, run in parallel by splitting the work across the available processors.
In [ ]:
from joblib import Parallel, delayed
import multiprocessing
def preprocessTweetsDf(path, lang, stopwords, stemmer, intermediate_info, parallelize):
    df = createTweetsDataframe(path, lang)
    processors = multiprocessing.cpu_count()
    if not parallelize:
        df['text_stopwords'] = [removeStopwords(text, stopwords) for text in df['text']]
    else:
        df['text_stopwords'] = Parallel(n_jobs=processors)(delayed(removeStopwords)(text, stopwords) for text in df['text'])
    #if intermediate_info == False:
        #del df['text']
    if not parallelize:
        df['text_stemming'] = [stemming(text, stemmer) for text in df['text_stopwords']]
    else:
        df['text_stemming'] = Parallel(n_jobs=processors)(delayed(stemming)(text, stemmer) for text in df['text_stopwords'])
    if not intermediate_info:
        del df['text_stopwords']
    return df

print('Number of processors:', multiprocessing.cpu_count())
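As an aside, a toy sketch of the joblib Parallel/delayed pattern used above; the word list is invented for the example.
In [ ]:
# Toy illustration of the Parallel/delayed pattern used in preprocessTweetsDf;
# the input list is invented for the example
words = ['hola', 'adiós', 'elecciones']
Parallel(n_jobs=multiprocessing.cpu_count())(delayed(str.upper)(w) for w in words)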
The tweets were collected during the Spanish election day of 26 June 2016 by tracking the hashtags #26J, #EleccionesGenerales2016 and #L6elecciones, so the vast majority of tweets contain them and they add no distinction between one text and another; they are therefore added to the stopword list. Likewise, the token "RT" and two different encodings of the ellipsis are also added as stopwords because of their irrelevance.
In [ ]:
path = '/home/user/Escritorio/2016/06/26/23'
add_stopwords = [u'RT', u'…', u'...', u'#26J', u'#EleccionesGenerales2016', u'#L6elecciones']
df = preprocessTweetsDf(path=path, lang='es', stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES(), intermediate_info=False, parallelize=True)
print(df.shape)
df.head()
In [ ]:
from gensim import corpora
def createDictionaryCorpusBOW(dataframe):
    # Build the gensim dictionary and the bag-of-words corpus from the stemmed texts
    texts = [[word for word in text.split()] for text in dataframe['text_stemming']]
    dictionary = corpora.Dictionary(texts)
    corpus_bow = [dictionary.doc2bow(text) for text in texts]
    return (dictionary, corpus_bow)
dictionary, corpus_bow = createDictionaryCorpusBOW(df)
#print(dictionary.token2id)
#pprint(corpus_bow)
In [ ]:
from gensim import models
In [ ]:
def tfidfModelCorpus(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return (tfidf, corpus_tfidf)
In [ ]:
tfidf, corpus_tfidf = tfidfModelCorpus(corpus_bow)
The LSI model can also be built from a corpus obtained by applying Term Frequency – Inverse Document Frequency (TF-IDF) to the bag-of-words corpus, which is computed only from the per-tweet token frequencies. To derive this new corpus, TF-IDF takes into account not only the frequency of each token but also the number of documents in which it appears. For the experiments in this project the bag-of-words corpus is used, so that both models start from the same corpus.
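To make the difference concrete, a quick sketch comparing one tweet's bag-of-words vector with its TF-IDF vector; the document index 0 is chosen arbitrarily for illustration.
In [ ]:
# Same document in both representations (index 0 is arbitrary)
print(corpus_bow[0])          # (token_id, raw count) pairs
print(tfidf[corpus_bow[0]])   # (token_id, TF-IDF weight) pairs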
In [ ]:
def lsiModelCorpus(corpus, num_topics, dictionary):
    lsi = models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    corpus_lsi = lsi[corpus]
    return (lsi, corpus_lsi)
In [ ]:
lsi, corpus_lsi = lsiModelCorpus(corpus=corpus_bow, num_topics=2, dictionary=dictionary)
#lsi, corpus_lsi = lsiModelCorpus(corpus=corpus_tfidf, num_topics=2, dictionary=dictionary)
pprint(lsi.print_topics())
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
def plot2dimensionsLSI(corpus):
    # Collect each document's coordinates in the 2-dimensional LSI space
    i = 0
    X = np.zeros((len(corpus), 2))
    for doc in corpus:
        for j in range(len(doc)):
            X[i][j] = doc[j][1]
        i += 1
    print(X.shape)
    plt.title('Documents in the LSI space')
    plt.xlabel('Dimension / Topic 1')
    plt.ylabel('Dimension / Topic 2')
    plt.scatter(X[:, 0], X[:, 1])

plot2dimensionsLSI(corpus_lsi)
In [ ]:
def ldaModelCorpus(corpus, num_topics, dictionary):
    lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
    corpus_lda = lda[corpus]
    return (lda, corpus_lda)
In [ ]:
import pyLDAvis
from pyLDAvis import gensim as gensimvis
In [ ]:
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=2, dictionary=dictionary)
pprint(lda.print_topics())
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
In [ ]:
num_topics = 3
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
In [ ]:
num_topics = 4
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
In [ ]:
num_topics = 5
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
In [ ]:
num_topics = 6
lda, corpus_lda = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
pprint(lda.print_topics(num_topics=num_topics))
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, corpus_bow, dictionary)
In [ ]:
from gensim import similarities
def similarTexts(text, model, dictionary, corpus_model, num_best, stopwords, stemmer):
    # Preprocess the query exactly like the corpus: stopword removal, stemming, bag of words
    vec_bow = dictionary.doc2bow(stemming(removeStopwords(text, stopwords), stemmer).split())
    vec_model = model[vec_bow]
    index = similarities.MatrixSimilarity(corpus_model)
    if num_best is not None and num_best != 0:
        index.num_best = num_best * 5  # Retrieve extra results so repeated tweets (essentially RTs) can be discarded
    sims = index[vec_model]
    # Identical texts get identical similarity scores, so consecutive duplicates
    # can be dropped by comparing scores
    last_score = None
    unique_sims = []
    for sim in sims:
        if sim[1] != last_score:
            unique_sims.append(sim)
            last_score = sim[1]
    return unique_sims[:num_best]

def similarityQueries(texts, list_models, list_num_topics, num_best, dictionary, stopwords, stemmer):
    for text in texts:
        for num_topics in list_num_topics:
            for m in list_models:
                if m == 'LSI':
                    model, corpus = lsiModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
                if m == 'LDA':
                    model, corpus = ldaModelCorpus(corpus=corpus_bow, num_topics=num_topics, dictionary=dictionary)
                sims = similarTexts(text=text, model=model, dictionary=dictionary, corpus_model=corpus, num_best=num_best, stopwords=stopwords, stemmer=stemmer)
                print('\n*****', '#Topics:', num_topics, '| Model:', m, '| Text:', text)
                for sim in sims:
                    print('----------\n', sim[1], '\n', df['text'][sim[0]])
                print('====================')
In [ ]:
texts = ['@marianorajoy, besando a su mujer y saltando en el balcón de Génova tras la victoria del @PPopular',
'La colalición Unidos Podemos, @ahorapodemos e @iunida, se queda con la suma de los escaños obtenidos en las últimas elecciones',
'El @PSOE vuelve a fracasar en su segundo intento por gobernar y Pedro Sánchez ve peligrar su liderazgo',
'@PartidoPACMA saca más votos que Bildu y Coalición Canaria pero se queda sin escaño']
similarityQueries(texts=texts, list_models=['LSI','LDA'], list_num_topics=[4,5], num_best=5, dictionary=dictionary, stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES())
In [ ]:
%load_ext memory_profiler
import datetime
paths = ['/home/user/Escritorio/2016/06/26/22', '/home/user/Escritorio/2016/06/26']
n_tweets, lsi_times, lda_times = [],[],[]
for path in paths:
    df = preprocessTweetsDf(path, lang='es', stopwords=stopwordsES(add_stopwords), stemmer=snowballStemmerES(), intermediate_info=False, parallelize=True)
    dictionary, corpus_bow = createDictionaryCorpusBOW(df)
    n_tweets.append(df.shape[0])
    print('\tLatent Semantic Indexing, #tweets =', n_tweets[-1])
    start_time = datetime.datetime.now()
    %memit lsi = models.LsiModel(corpus_bow, num_topics=4, id2word=dictionary)
    lsi_times.append((datetime.datetime.now() - start_time).total_seconds())
    print('\tLatent Dirichlet Allocation, #tweets =', n_tweets[-1])
    start_time = datetime.datetime.now()
    %memit lda = models.ldamodel.LdaModel(corpus_bow, num_topics=4, id2word=dictionary)
    lda_times.append((datetime.datetime.now() - start_time).total_seconds())

for i in range(len(n_tweets)):
    print('#tweets:', n_tweets[i], '| LSI time elapsed:', lsi_times[i], 'seconds', '| LDA time elapsed:', lda_times[i], 'seconds')

plt.plot(n_tweets, lsi_times, 'b--', n_tweets, lda_times, 'r--')