In [1]:
import pandas as pd
import os.path
# import textmining  # unused: breaks on float (NaN) values
from sklearn.feature_extraction.text import CountVectorizer

path = '../NYTimes_Data/'

dataSet = pd.read_csv(os.path.join(path, 'NYTimesBlogTrain.csv'))
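A quick shape check is worth adding here (a sketch; the Kaggle training file is assumed to contain 6,532 rows, which is where the `[:6532]` slices later in the notebook come from):

print(dataSet.shape)                      # expected: (6532, 10)
print(dataSet['Popular'].value_counts())  # class balance of the target column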

In [2]:
dataSet.head()


Out[2]:
NewsDesk SectionName SubsectionName Headline Snippet Abstract WordCount PubDate Popular UniqueID
0 Business Crosswords/Games NaN More School Daze A puzzle from Ethan Cooper that reminds me tha... A puzzle from Ethan Cooper that reminds me tha... 508 2014-09-01 22:00:09 1 1
1 Culture Arts NaN New 96-Page Murakami Work Coming in December The Strange Library will arrive just three and... The Strange Library will arrive just three and... 285 2014-09-01 21:14:07 0 2
2 Business Business Day Dealbook Public Pension Funds Stay Mum on Corporate Expats Public pension funds have major stakes in Amer... Public pension funds have major stakes in Amer... 1211 2014-09-01 21:05:36 0 3
3 Business Business Day Dealbook Boot Camp for Bankers As they struggle to find new business to bolst... As they struggle to find new business to bolst... 1405 2014-09-01 20:43:34 1 4
4 Science Health NaN Of Little Help to Older Knees Middle-aged and older patients are unlikely to... Middle-aged and older patients are unlikely to... 181 2014-09-01 18:58:51 1 5

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

#tknzr = StringTokenizer()
nltk.download('stopwords')
nltk.download('punkt')  # word_tokenize below needs the punkt tokenizer models
stop = stopwords.words('english')
# custom stopwords
#stop += ['<hashtag>', '<url>', '<allcaps>', '<number>', '<user>', '<repeat>', '<elong>', 'websummit']
dataSet["processed"] = ""


[nltk_data] Downloading package stopwords to /home/blaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [4]:
processed = []
for cnt, row in dataSet.iterrows():
    abstract = row['Abstract']
    # str(NaN) is 'nan' (length 3), so test for missing values explicitly
    if pd.notnull(abstract) and len(str(abstract)) > 0:
        parts = word_tokenize(str(abstract))
        # note: matching is case-sensitive, so capitalized stop words survive (see Out[6])
        clean = [i for i in parts if i not in stop]
    else:
        clean = []  # keep the type consistent: always a list of tokens
    processed.append(clean)
    #dataSet.at[cnt,'processed'] = clean

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

ab_text = dataSet['Abstract'].tolist()                      # list of all abstract texts
ab_text_processed = [" ".join(data) for data in processed]  # abstracts with stop words removed

In [6]:
ab_text_processed[1:2]


Out[6]:
['The Strange Library arrive three half months Mr. Murakamis latest novel , Colorless Tsukuru Tazaki His Years Pilgrimage .']

In [7]:
vectorizer = TfidfVectorizer(min_df=4, max_features=10000)
vz = vectorizer.fit_transform(ab_text_processed)
# map each vocabulary term to its idf weight
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
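As a quick sanity check on the tfidf dict (a sketch, not part of the original run): the highest-idf entries should be rare, distinctive terms and the lowest-idf entries common ones.

by_idf = sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)
print(by_idf[:5])    # rarest terms (highest idf)
print(by_idf[-5:])   # most common terms (lowest idf)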

In [8]:
#tfidf.keys()

In [9]:
print("asian: " + str(tfidf["asian"]))


asian: 7.70517999316

In [10]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz[:6532])
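How much of the tf-idf signal do 50 components keep? A one-line check (a sketch; the exact figure depends on the corpus):

print(svd.explained_variance_ratio_.sum())  # fraction of variance retained by 50 components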

In [11]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 20.508029
[t-SNE] Error after 236 iterations: 2.424675

In [12]:
#from tornado import gen
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()
plot_tfidf = figure(plot_width=900, plot_height=700, title="NY Times - plot (tf-idf)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_tfidf.scatter(x='x', y='y',
                    source=bp.ColumnDataSource({
                        "x": tsne_tfidf[:,0],
                        "y": tsne_tfidf[:,1],
                        "text": ab_text[:6532],
                        "processed": ab_text_processed[:6532]
                    }))

hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"text": "@text (processed: \"@processed\")"}
show(plot_tfidf)


Loading BokehJS ...
Out[12]:

<Bokeh Notebook handle for In[12]>


In [13]:
from sklearn.cluster import MiniBatchKMeans

num_clusters = 10
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
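Before plotting, it helps to see how evenly the abstracts spread across the 10 clusters (a sketch; np.bincount just tallies the integer labels):

import numpy as np
print(np.bincount(kmeans_clusters, minlength=num_clusters))  # documents per cluster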

In [15]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    # ten highest-weighted terms in each cluster centroid
    print('Cluster %d: %s' % (i, ' '.join(terms[j] for j in sorted_centroids[i, :10])))


Cluster 0: international herald tribune archives highlights from what 1914 1939 1964
Cluster 1: american north the million theaters weekend new last ballet estimated
Cluster 2: the new company times first artist said bank would one
Cluster 3: president obama obamas one vice former ask immigration question the
Cluster 4: york week fashion new scenes times diary photo past appeared
Cluster 5: day emailed newsroom clip managers daily senior executives report media
Cluster 6: republican washington senator senate series representative mcconnell latest mitch race
Cluster 7: photos hong kong turkey india ukraine syria democracy china israel
Cluster 8: white house obama president wednesday the fence collar ebola said
Cluster 9: new one in year said week can first two best

In [16]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:6532])


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 17.726824
[t-SNE] Error after 336 iterations: 2.178226

In [19]:
import numpy as np

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="NY Times (k-means)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_kmeans.scatter(x='x', y='y', color='color',
                    source=bp.ColumnDataSource({
                        "x": tsne_kmeans[:,0],
                        "y": tsne_kmeans[:,1],
                        "color": colormap[kmeans_clusters[:6532]],
                        "Abstract": ab_text[:6532],
                        "processed": ab_text_processed[:6532],
                        "cluster": kmeans_clusters[:6532]
                    }))
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"Abstract": "@Abstract (processed: \"@processed\" - cluster: @cluster)"}
show(plot_kmeans)


Out[19]:

<Bokeh Notebook handle for In[19]>


In [21]:
import lda
from sklearn.feature_extraction.text import CountVectorizer

# LDA expects raw term counts, so use CountVectorizer here rather than tf-idf
cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(ab_text_processed)

n_topics = 15
n_iter = 2000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)
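A quick post-fit check (a sketch; X_topics, the matrix returned by fit_transform, holds one topic distribution per document, each row summing to 1):

print(X_topics.shape)        # expected: (number of abstracts, 15)
print(X_topics[0].round(3))  # topic mixture of the first abstract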

In [22]:
n_top_words = 8
topic_summaries = []

topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))


Topic 0: states united writes photos reuters breakingviews news china
Topic 1: senate senator republican campaign democratic election gov new
Topic 2: week fashion diary scenes new york photo metropolitan
Topic 3: international herald archives tribune highlights war 1939 1964
Topic 4: new company business social online apple internet start
Topic 5: new mr year film old american discusses director
Topic 6: ebola students chinese china readers university world rights
Topic 7: new york times past year great appeared articles
Topic 8: day times media report collection executives daily senior
Topic 9: best choose video word blank time 10 album
Topic 10: new people make think study good time like
Topic 11: bank banks financial federal wall street said chief
Topic 12: new art share artist day museum home summer
Topic 13: president obama white house said state general united
Topic 14: company billion said million fund deal year percent

In [23]:
tsne_lda = tsne_model.fit_transform(X_topics[:6532])


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 1000 / 6532
[t-SNE] Computed conditional probabilities for sample 2000 / 6532
[t-SNE] Computed conditional probabilities for sample 3000 / 6532
[t-SNE] Computed conditional probabilities for sample 4000 / 6532
[t-SNE] Computed conditional probabilities for sample 5000 / 6532
[t-SNE] Computed conditional probabilities for sample 6000 / 6532
[t-SNE] Computed conditional probabilities for sample 6532 / 6532
[t-SNE] Mean sigma: 0.000000
[t-SNE] Error after 100 iterations with early exaggeration: 18.997276
[t-SNE] Error after 340 iterations: 2.406251

In [24]:
doc_topic = lda_model.doc_topic_
# iterating a DataFrame yields column names, not rows, so take the
# argmax over the whole document-topic matrix instead
lda_keys = doc_topic.argmax(axis=1)
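It is worth seeing how the abstracts spread over the 15 topics before colouring the plot (a sketch; np.bincount tallies the dominant-topic labels):

print(np.bincount(lda_keys, minlength=n_topics))  # documents per dominant topic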

In [25]:
plot_lda = bp.figure(plot_width=900, plot_height=700, title="NY Times (LDA)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', color='color',
                 source=bp.ColumnDataSource({
                    "x": tsne_lda[:,0],
                    "y": tsne_lda[:,1],
                    "color": colormap[lda_keys[:6532]],
                    "Text": ab_text[:6532],
                    "processed": ab_text_processed[:6532],
                    "topic_key": lda_keys[:6532]
                }))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"Text": "@Text (processed: \"@processed\" - topic: @topic_key)"}
show(plot_lda)


Out[25]:

<Bokeh Notebook handle for In[25]>