In [1]:
import pandas as pd
import os.path
#import textmining  # skipped: it breaks on float (NaN) values in the data
from sklearn.feature_extraction.text import CountVectorizer
path = '../NYTimes_Data/'
dataSet = pd.read_csv(os.path.join(path, 'NYTimesBlogTrain.csv'))
In [2]:
dataSet.head()
Out[2]:
In [3]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
#tknzr = StringTokenizer()
nltk.download('stopwords')
nltk.download('punkt')  # required by word_tokenize
stop = stopwords.words('english')
# custom stopwords
#stop += ['<hashtag>', '<url>', '<allcaps>', '<number>', '<user>', '<repeat>', '<elong>', 'websummit']
dataSet["processed"] = ""
In [4]:
processed = []
for cnt, row in dataSet.iterrows():
    Abstract_Value = row['Abstract']
    if pd.notna(Abstract_Value):  # skip missing abstracts
        parts = word_tokenize(str(Abstract_Value))
        clean = [i for i in parts if i not in stop]
    else:
        clean = []  # keep types consistent: a list, not ''
    processed.append(clean)
    #dataSet.at[cnt, 'processed'] = clean
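Note that NLTK's English stopword list is all lowercase, so capitalized tokens such as "The" survive the filter above. A lowercasing variant of the loop (a sketch, not part of the original pipeline) would look like this:
In [ ]:
# Sketch: same loop, but lowercase tokens before the stopword filter
processed_lower = []
for cnt, row in dataSet.iterrows():
    Abstract_Value = row['Abstract']
    if pd.notna(Abstract_Value):
        parts = word_tokenize(str(Abstract_Value).lower())
        processed_lower.append([i for i in parts if i not in stop])
    else:
        processed_lower.append([])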
In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
ab_text = [str(data['Abstract']) for cnt, data in dataSet.iterrows()]  # raw abstract texts (str() guards against NaN)
ab_text_processed = [" ".join(data) for data in processed]             # stopword-filtered abstracts, re-joined into strings
In [6]:
ab_text_processed[1:2]
Out[6]:
In [7]:
vectorizer = TfidfVectorizer(min_df=4, max_features=10000)
vz = vectorizer.fit_transform(ab_text_processed)
tfidf = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))  # term -> idf weight
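The idf_ vector pairs naturally with the vocabulary: higher idf means rarer. A quick look at both extremes of the vocabulary (a sketch, not part of the original run):
In [ ]:
# rarest and most common vocabulary terms by idf weight
rarest = sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)[:10]
common = sorted(tfidf.items(), key=lambda kv: kv[1])[:10]
print("rarest:", [t for t, _ in rarest])
print("most common:", [t for t, _ in common])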
In [8]:
#tfidf.keys()
In [9]:
print("asian: " + str(tfidf["asian"]))
In [10]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz[:6532])  # use the first 6532 abstracts only (keeps t-SNE tractable)
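Before handing the reduced matrix to t-SNE, it is worth checking how much of the tf-idf variance the 50 components actually retain (a sketch using scikit-learn's explained_variance_ratio_):
In [ ]:
# fraction of tf-idf variance kept by the 50-component projection
print("explained variance: {:.1%}".format(svd.explained_variance_ratio_.sum()))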
In [11]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
In [12]:
#from tornado import gen
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()
plot_tfidf = figure(width=900, height=700, title="NY Times - plot (tf-idf)",
                    tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                    x_axis_type=None, y_axis_type=None, min_border=1)
# pass x/y through the ColumnDataSource so the hover tool sees every column
plot_tfidf.scatter(x='x', y='y',
                   source=bp.ColumnDataSource({
                       "x": tsne_tfidf[:, 0],
                       "y": tsne_tfidf[:, 1],
                       "text": ab_text[:6532],
                       "processed": ab_text_processed[:6532]
                   }))
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips = [("text", "@text (processed: \"@processed\")")]
show(plot_tfidf)
Out[12]:
In [13]:
from sklearn.cluster import MiniBatchKMeans
num_clusters = 10
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
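Before plotting, a quick look at how evenly k-means split the corpus (a sketch; np.bincount simply counts labels per cluster):
In [ ]:
import numpy as np
# number of abstracts assigned to each of the 10 clusters
print(np.bincount(kmeans_clusters, minlength=num_clusters))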
In [15]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]  # term indices, highest centroid weight first
terms = vectorizer.get_feature_names_out()
for i in range(num_clusters):
    print("Cluster %d:" % i)
    for j in sorted_centroids[i, :10]:
        print(' %s' % terms[j])
    print()
In [16]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:6532])
In [19]:
import numpy as np
colormap = np.array([
"#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
"#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
"#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
"#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])
plot_kmeans = bp.figure(width=900, height=700, title="NY Times (k-means)",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                        x_axis_type=None, y_axis_type=None, min_border=1)
plot_kmeans.scatter(x='x', y='y', color='color',
                    source=bp.ColumnDataSource({
                        "x": tsne_kmeans[:, 0],
                        "y": tsne_kmeans[:, 1],
                        "color": colormap[kmeans_clusters[:6532]],
                        "Abstract": ab_text[:6532],
                        "processed": ab_text_processed[:6532],
                        "cluster": kmeans_clusters[:6532]
                    }))
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips = [("Abstract", "@Abstract (processed: \"@processed\" - cluster: @cluster)")]
show(plot_kmeans)
Out[19]:
In [21]:
import lda
from sklearn.feature_extraction.text import CountVectorizer
cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(ab_text_processed)
n_topics = 15
n_iter = 2000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)
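A quick structural check on the fitted model (a sketch; each row of X_topics is a per-document topic distribution and should sum to roughly 1):
In [ ]:
print(X_topics.shape)     # (n_documents, n_topics)
print(X_topics[0].sum())  # a distribution over the 15 topics, so ~1.0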
In [22]:
n_top_words = 8
topic_summaries = []
topic_word = lda_model.topic_word_  # topic-word distributions, one row per topic
vocab = cvectorizer.get_feature_names_out()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
In [23]:
tsne_lda = tsne_model.fit_transform(X_topics[:6532])
In [24]:
doc_topic = lda_model.doc_topic_
# assign each document to its single most probable topic
lda_keys = doc_topic.argmax(axis=1)
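As with k-means, it is useful to see how the documents spread across topics before plotting (a sketch using np.bincount on the argmax assignments):
In [ ]:
# number of documents whose most probable topic is each of the 15 topics
print(np.bincount(lda_keys, minlength=n_topics))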
In [25]:
plot_lda = bp.figure(width=900, height=700, title="NY Times (LDA)",
                     tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                     x_axis_type=None, y_axis_type=None, min_border=1)
plot_lda.scatter(x='x', y='y', color='color',
                 source=bp.ColumnDataSource({
                     "x": tsne_lda[:, 0],
                     "y": tsne_lda[:, 1],
                     "color": colormap[lda_keys[:6532]],
                     "Text": ab_text[:6532],
                     "processed": ab_text_processed[:6532],
                     "topic_key": lda_keys[:6532]
                 }))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = [("Text", "@Text (processed: \"@processed\" - topic: @topic_key)")]
show(plot_lda)
Out[25]: