In [2]:
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html

from __future__ import print_function
from time import time

import pandas
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation  # v. 0.17.1
from sklearn.datasets import fetch_20newsgroups

In [3]:
def csv_to_df(csv_file):
    """Open csv, return Pandas DataFrame."""
    dataframe = pandas.read_csv(csv_file, 
                             delimiter='|', 
                             error_bad_lines=False, 
                             warn_bad_lines=False,
                            )
    return dataframe

In [4]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [5]:
dataframe_popular = csv_to_df('../tweets/tweets_popular.csv')

In [6]:
data_samples = dataframe_popular['_text'].tolist()  # list of str

In [7]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


Extracting tf-idf features for NMF...
done in 0.158s.

In [8]:
data_samples[:3]


Out[8]:
['@CringeLMAO: Easy there m8 https://t.co/dnF3Wqdt1C',
 '@AustinMahone: Just posted a photo https://t.co/hXFg6TyuzE',
 "@Ashton5SOS: Some days I drink way to much coffee and fill your Twitter feeds with stupid replies and pointless videos, I ain't sorry ok"]

In [9]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


Extracting tf features for LDA...
done in 0.212s.

In [10]:
# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, 
          random_state=1,
          alpha=.1, 
          l1_ratio=.5).fit(tfidf)
exit()
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
done in 0.242s.

Topics in NMF model:
Topic #0:
https day new time rt want sexualgif justin grammy happy oh bieber people girl goals amp fuck black grammys performance
Topic #1:
love justinbieber beliebers ready did grammys performance tonight 3zhzx54wr0 thank remember riuhvhdg8z stage mylovaticsaremylife believed mtvema ddlovato billboard diplo skrillex
Topic #2:
im ice cube snapchat dead worldstarfunny cae4iqzfwg https crying lom9ctefao besideyoustyles deadhdj playfuily etricnhsxz j0hbgendly woridstarhiph0p oy2wfefsyp xl8g2p64sz freddyamazin woridstarcomedy
Topic #3:
life djkingassassin warriors wyclef level struggle matter maddi_says headsaudio bumpin groovin trying girl new rest forward want looking camerondallas tweetlikeagiri
Topic #4:
gets watch funnier woridstarhiph0p hndwodypak https t3ttxk1icp worldstarfunny lzibp2skdl niggacommentary qua1oapdyy woridstarcomedy gdevfhnzbl atp6wvvgul fillwerrell dory time kc5ieskdr8 4kthm0lrkq woridstarhiphop
Topic #5:
kanyewest debt puts americans education started chance fuck greatness shut enjoy struggle level matter dream god likable worried aight people
Topic #6:
thegrammys grammys taylorswift13 congrats best album screaming pop vocal 1989 6gqbpr2jmw ladygaga skrillex diplo justinbieber performance intelinmusic kendricklamar amp winning
Topic #7:
need carterreynolds good pic mom date boys fish pics instagram work keeping god oh caught dog sodamntrue nap graysondolan caug
Topic #8:
like just don looks love internallyiost realized look people talk welcome start olive garden girl drink hi know http understand
Topic #9:
kendrick lamar beyonc reactionbeyonce toqfog9yqt artistic inspiration creative jackjackjohnson mind performance grammys 2016 tribecalledgod h0vzipci7v blacklivesmatter gwq6viygmd onlyhiphopfacts https grammy


In [1]:
'''
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
'''


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-98b48b48bee3> in <module>()
      1 print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
----> 2       % (n_samples, n_features))
      3 lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
      4                                 learning_method='online', learning_offset=50.,
      5                                 random_state=0)

NameError: name 'n_samples' is not defined

In [ ]:


In [ ]: