In [1]:
import pandas as pd
import os.path
# import textmining  # unused: breaks on float (NaN) values
from sklearn.feature_extraction.text import CountVectorizer

path = '../NYTimes_Data/'

dataSet = pd.read_csv(os.path.join(path, 'NYTimesBlogTrain.csv'))
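A quick shape check is worth adding here (a sketch; the Kaggle training file is assumed to contain 6,532 rows, which is where the `[:6532]` slices later in the notebook come from):

print(dataSet.shape)                      # expected: (6532, 10)
print(dataSet['Popular'].value_counts())  # class balance of the target column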

In [2]:
dataSet.head()


Out[2]:
NewsDesk SectionName SubsectionName Headline Snippet Abstract WordCount PubDate Popular UniqueID
0 Business Crosswords/Games NaN More School Daze A puzzle from Ethan Cooper that reminds me tha... A puzzle from Ethan Cooper that reminds me tha... 508 2014-09-01 22:00:09 1 1
1 Culture Arts NaN New 96-Page Murakami Work Coming in December The Strange Library will arrive just three and... The Strange Library will arrive just three and... 285 2014-09-01 21:14:07 0 2
2 Business Business Day Dealbook Public Pension Funds Stay Mum on Corporate Expats Public pension funds have major stakes in Amer... Public pension funds have major stakes in Amer... 1211 2014-09-01 21:05:36 0 3
3 Business Business Day Dealbook Boot Camp for Bankers As they struggle to find new business to bolst... As they struggle to find new business to bolst... 1405 2014-09-01 20:43:34 1 4
4 Science Health NaN Of Little Help to Older Knees Middle-aged and older patients are unlikely to... Middle-aged and older patients are unlikely to... 181 2014-09-01 18:58:51 1 5

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

#tknzr = StringTokenizer()
nltk.download('stopwords')
nltk.download('punkt')  # word_tokenize below needs the punkt tokenizer models
stop = stopwords.words('english')
# custom stopwords
#stop += ['<hashtag>', '<url>', '<allcaps>', '<number>', '<user>', '<repeat>', '<elong>', 'websummit']
dataSet["processed"] = ""


[nltk_data] Downloading package stopwords to /home/blaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [4]:
processed = []
for cnt, row in dataSet.iterrows():
    abstract = row['Abstract']
    # str(NaN) is 'nan' (length 3), so test for missing values explicitly
    if pd.notnull(abstract) and len(str(abstract)) > 0:
        parts = word_tokenize(str(abstract))
        # note: matching is case-sensitive, so capitalized stop words survive (see Out[6])
        clean = [i for i in parts if i not in stop]
    else:
        clean = []  # keep the type consistent: always a list of tokens
    processed.append(clean)
    #dataSet.at[cnt,'processed'] = clean

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

ab_text = dataSet['Abstract'].tolist()                      # list of all abstract texts
ab_text_processed = [" ".join(data) for data in processed]  # abstracts with stop words removed

In [6]:
ab_text_processed[1:2]


Out[6]:
['The Strange Library arrive three half months Mr. Murakamis latest novel , Colorless Tsukuru Tazaki His Years Pilgrimage .']

In [7]:
vectorizer = TfidfVectorizer(min_df=4, max_features=10000)
vz = vectorizer.fit_transform(ab_text_processed)
# map each vocabulary term to its idf weight
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
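As a quick sanity check on the tfidf dict (a sketch, not part of the original run): the highest-idf entries should be rare, distinctive terms and the lowest-idf entries common ones.

by_idf = sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)
print(by_idf[:5])    # rarest terms (highest idf)
print(by_idf[-5:])   # most common terms (lowest idf)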

In [8]:
#tfidf.keys()

In [9]:
print("asian: " + str(tfidf["asian"]))


asian: 7.70517999316

In [10]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz[:6532])
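How much of the tf-idf signal do 50 components keep? A one-line check (a sketch; the exact figure depends on the corpus):

print(svd.explained_variance_ratio_.sum())  # fraction of variance retained by 50 components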

In [11]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 20.508029
[t-SNE] Error after 236 iterations: 2.424675

In [12]:
#from tornado import gen
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()
plot_tfidf = figure(plot_width=900, plot_height=700, title="NY Times - plot (tf-idf)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_tfidf.scatter(x='x', y='y',
                    source=bp.ColumnDataSource({
                        "x": tsne_tfidf[:,0],
                        "y": tsne_tfidf[:,1],
                        "text": ab_text[:6532],
                        "processed": ab_text_processed[:6532]
                    }))

hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"text": "@text (processed: \"@processed\")"}
show(plot_tfidf)


Loading BokehJS ...
Out[12]:

<Bokeh Notebook handle for In[12]>


In [13]:
from sklearn.cluster import MiniBatchKMeans

num_clusters = 10
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
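Before plotting, it helps to see how evenly the abstracts spread across the 10 clusters (a sketch; np.bincount just tallies the integer labels):

import numpy as np
print(np.bincount(kmeans_clusters, minlength=num_clusters))  # documents per cluster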

In [15]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    # ten highest-weighted terms in each cluster centroid
    print('Cluster %d: %s' % (i, ' '.join(terms[j] for j in sorted_centroids[i, :10])))


Cluster 0: international herald tribune archives highlights from what 1914 1939 1964
Cluster 1: american north the million theaters weekend new last ballet estimated
Cluster 2: the new company times first artist said bank would one
Cluster 3: president obama obamas one vice former ask immigration question the
Cluster 4: york week fashion new scenes times diary photo past appeared
Cluster 5: day emailed newsroom clip managers daily senior executives report media
Cluster 6: republican washington senator senate series representative mcconnell latest mitch race
Cluster 7: photos hong kong turkey india ukraine syria democracy china israel
Cluster 8: white house obama president wednesday the fence collar ebola said
Cluster 9: new one in year said week can first two best

In [16]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:6532])


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 17.726824
[t-SNE] Error after 336 iterations: 2.178226

In [19]:
import numpy as np

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="NY Times (k-means)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_kmeans.scatter(x='x', y='y', color='color',
                    source=bp.ColumnDataSource({
                        "x": tsne_kmeans[:,0],
                        "y": tsne_kmeans[:,1],
                        "color": colormap[kmeans_clusters[:6532]],
                        "Abstract": ab_text[:6532],
                        "processed": ab_text_processed[:6532],
                        "cluster": kmeans_clusters[:6532]
                    }))
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"Abstract": "@Abstract (processed: \"@processed\" - cluster: @cluster)"}
show(plot_kmeans)


Out[19]:

<Bokeh Notebook handle for In[19]>


In [21]:
import lda
from sklearn.feature_extraction.text import CountVectorizer

# LDA expects raw term counts, so use CountVectorizer here rather than tf-idf
cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(ab_text_processed)

n_topics = 15
n_iter = 2000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)
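A quick post-fit check (a sketch; X_topics, the matrix returned by fit_transform, holds one topic distribution per document, each row summing to 1):

print(X_topics.shape)        # expected: (number of abstracts, 15)
print(X_topics[0].round(3))  # topic mixture of the first abstract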

In [22]:
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))


Topic 0: states united writes photos reuters breakingviews news china
Topic 1: senate senator republican campaign democratic election gov new
Topic 2: week fashion diary scenes new york photo metropolitan
Topic 3: international herald archives tribune highlights war 1939 1964
Topic 4: new company business social online apple internet start
Topic 5: new mr year film old american discusses director
Topic 6: ebola students chinese china readers university world rights
Topic 7: new york times past year great appeared articles
Topic 8: day times media report collection executives daily senior
Topic 9: best choose video word blank time 10 album
Topic 10: new people make think study good time like
Topic 11: bank banks financial federal wall street said chief
Topic 12: new art share artist day museum home summer
Topic 13: president obama white house said state general united
Topic 14: company billion said million fund deal year percent

In [23]:
tsne_lda = tsne_model.fit_transform(X_topics[:6532])


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 18.997276
[t-SNE] Error after 340 iterations: 2.406251

In [24]:
doc_topic = lda_model.doc_topic_
# iterating a DataFrame yields column names, not rows, so take the
# argmax over the whole document-topic matrix instead
lda_keys = doc_topic.argmax(axis=1)
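It is worth seeing how the abstracts spread over the 15 topics before colouring the plot (a sketch; np.bincount tallies the dominant-topic labels):

print(np.bincount(lda_keys, minlength=n_topics))  # documents per dominant topic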

In [25]:
plot_lda = bp.figure(plot_width=900, plot_height=700, title="NY Times (LDA)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', color='color',
                 source=bp.ColumnDataSource({
                    "x": tsne_lda[:,0],
                    "y": tsne_lda[:,1],
                    "color": colormap[lda_keys[:6532]],
                    "Text": ab_text[:6532],
                    "processed": ab_text_processed[:6532],
                    "topic_key": lda_keys[:6532]
                }))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"Text": "@Text (processed: \"@processed\" - topic: @topic_key)"}
show(plot_lda)


Out[25]:

<Bokeh Notebook handle for In[25]>