notebook.community

Edit and run



In [2]:

    
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html

from __future__ import print_function
from time import time

import pandas
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation  # v. 0.17.1
from sklearn.datasets import fetch_20newsgroups



In [3]:

    
def csv_to_df(csv_file):
    """Open csv, return Pandas DataFrame."""
    dataframe = pandas.read_csv(csv_file, 
                             delimiter='|', 
                             error_bad_lines=False, 
                             warn_bad_lines=False,
                            )
    return dataframe



In [4]:

    
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()



In [5]:

    
dataframe_popular = csv_to_df('../tweets/tweets_popular.csv')



In [6]:

    
data_samples = dataframe_popular['_text'].tolist()  # list of str



In [7]:

    
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))









    



Extracting tf-idf features for NMF...
done in 0.158s.



In [8]:

    
data_samples[:3]









    Out[8]:





['@CringeLMAO: Easy there m8 https://t.co/dnF3Wqdt1C',
 '@AustinMahone: Just posted a photo https://t.co/hXFg6TyuzE',
 "@Ashton5SOS: Some days I drink way to much coffee and fill your Twitter feeds with stupid replies and pointless videos, I ain't sorry ok"]



In [9]:

    
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))









    



Extracting tf features for LDA...
done in 0.212s.



In [10]:

    
# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, 
          random_state=1,
          alpha=.1, 
          l1_ratio=.5).fit(tfidf)
exit()
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)









    



Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
done in 0.242s.

Topics in NMF model:
Topic #0:
https day new time rt want sexualgif justin grammy happy oh bieber people girl goals amp fuck black grammys performance
Topic #1:
love justinbieber beliebers ready did grammys performance tonight 3zhzx54wr0 thank remember riuhvhdg8z stage mylovaticsaremylife believed mtvema ddlovato billboard diplo skrillex
Topic #2:
im ice cube snapchat dead worldstarfunny cae4iqzfwg https crying lom9ctefao besideyoustyles deadhdj playfuily etricnhsxz j0hbgendly woridstarhiph0p oy2wfefsyp xl8g2p64sz freddyamazin woridstarcomedy
Topic #3:
life djkingassassin warriors wyclef level struggle matter maddi_says headsaudio bumpin groovin trying girl new rest forward want looking camerondallas tweetlikeagiri
Topic #4:
gets watch funnier woridstarhiph0p hndwodypak https t3ttxk1icp worldstarfunny lzibp2skdl niggacommentary qua1oapdyy woridstarcomedy gdevfhnzbl atp6wvvgul fillwerrell dory time kc5ieskdr8 4kthm0lrkq woridstarhiphop
Topic #5:
kanyewest debt puts americans education started chance fuck greatness shut enjoy struggle level matter dream god likable worried aight people
Topic #6:
thegrammys grammys taylorswift13 congrats best album screaming pop vocal 1989 6gqbpr2jmw ladygaga skrillex diplo justinbieber performance intelinmusic kendricklamar amp winning
Topic #7:
need carterreynolds good pic mom date boys fish pics instagram work keeping god oh caught dog sodamntrue nap graysondolan caug
Topic #8:
like just don looks love internallyiost realized look people talk welcome start olive garden girl drink hi know http understand
Topic #9:
kendrick lamar beyonc reactionbeyonce toqfog9yqt artistic inspiration creative jackjackjohnson mind performance grammys 2016 tribecalledgod h0vzipci7v blacklivesmatter gwq6viygmd onlyhiphopfacts https grammy



In [1]:

    
'''
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
'''









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-98b48b48bee3> in <module>()
      1 print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
----> 2       % (n_samples, n_features))
      3 lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
      4                                 learning_method='online', learning_offset=50.,
      5                                 random_state=0)

NameError: name 'n_samples' is not defined



In [ ]:



In [ ]: