In [ ]:
import json
import itertools
import pickle
import hickle
import gzip
import operator
import os
import sys
from time import time
import pprint as pp
import collections
import ConfigParser
from collections import Counter
from operator import itemgetter
import numpy as np
import pandas as pd
import twitter
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.externals import joblib
from sklearn.feature_extraction import text
# bokeh
import bokeh.plotting as bkplt
from bokeh.charts import *
from bokeh.io import output_notebook
from bokeh.charts import Histogram, show
# import requirements
from IPython.display import Image
from IPython.display import display
import matplotlib.pyplot as plt
import rpy2
%load_ext rpy2.ipython
%R require("ggplot2")
%matplotlib inline
from ggplot import *
randn = np.random.randn
Let's grab some JSON records from Twitter's public API. We'll use python-twitter:
$ pip install python-twitter
$ pydoc twitter.Api
Build an app at https://apps.twitter.com/, then put the app's credentials in the config.cfg file.
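For reference, config.cfg is a plain INI-style file that ConfigParser reads. A minimal sketch with placeholder values, using the same section and option names the next cell expects:
[oauth]
token = YOUR_ACCESS_TOKEN
token_secret = YOUR_ACCESS_TOKEN_SECRET
con_key = YOUR_CONSUMER_KEY
con_secret_key = YOUR_CONSUMER_SECRET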
In [ ]:
# read the config file.
config = ConfigParser.RawConfigParser()
config.read('config.cfg')
# creds found in your Twitter app. See https://apps.twitter.com/
token = config.get('oauth','token')
token_secret = config.get('oauth','token_secret')
con_key = config.get('oauth','con_key')
con_secret_key = config.get('oauth','con_secret_key')
# setup
api = twitter.Api(
consumer_key=con_key
, consumer_secret=con_secret_key
, access_token_key = token
, access_token_secret = token_secret)
# test creds
print "@{}".format(api.VerifyCredentials().GetScreenName())
In [ ]:
# get ~5000 tweets from the public API.
results = api.GetSearch(term = 'golden retriever', count = 100, include_entities=True)
counter = 1
total_tweets = 5000
tweets = []
while counter <= total_tweets:
    if counter == 1:
        new_results = api.GetSearch(term = 'golden retriever'
                                    , count = 100
                                    , max_id = results[-1].GetId()
                                    , include_entities=True)
    else:
        new_results = api.GetSearch(term = 'golden retriever'
                                    , count = 100
                                    , max_id = new_results[-1].GetId()
                                    , include_entities=True)
    # stop early if the search runs out of results, otherwise the loop never ends
    if not new_results:
        break
    counter += len(new_results)
    tweets.extend(new_results)
# store tweets
tweet_text = [tweet.GetText() for tweet in tweets]
print len(tweet_text)
pickle.dump(tweet_text, open('./data/tweet_text.pkl','wb'))
In [ ]:
# Set up a training and test set.
def create_index(total_tweets):
    """
    Builds an index for the training and test sets.
    The sets serve as lists of row numbers to extract from the dataset.
    """
    # based on the total tweet count, create an array of all line numbers
    line_index = np.array(range(0, total_tweets))
    # split the array into training and test sets of index values
    trainIndex, testIndex = train_test_split(line_index, train_size=0.70, random_state=42)
    # save test & training index values
    #np.save("training_index", trainIndex)
    #np.save("testing_index", testIndex)
    return trainIndex, testIndex
# build indices
trainIndex, testIndex = create_index(len(tweet_text))
pickle.dump(trainIndex, open('data/trainIndex.pkl','wb'))
pickle.dump(testIndex, open('data/testIndex.pkl','wb'))
# build test and training sets
test_tweets = [tweet_text[i] for i in testIndex]
pickle.dump(test_tweets, open('data/test_tweets.pkl','wb'))
train_tweets = [tweet_text[i] for i in trainIndex]
pickle.dump(train_tweets, open('data/train_tweets.pkl','wb'))
print "train: {:,}".format(len(train_tweets))
print "test: {:,}".format(len(test_tweets))
In [ ]:
# Set up a vectorizer.
# see http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def vectorize_1():
    vectorizer = TfidfVectorizer(#min_df=20
                                 stop_words='english'
                                 #, sublinear_tf=True
                                 , use_idf=True # enable inverse-document-frequency reweighting
                                 , ngram_range=(1,2) # given our vocab, not really necessary
                                 , binary=True # presence of a word instead of its frequency
                                 #, vocabulary = vocab
                                 )
    #X = vectorizer.fit_transform(tweet_list)
    return vectorizer

def vectorize_2(vocab):
    vectorizer = CountVectorizer(stop_words='english'
                                 , ngram_range=(1,2) # given our vocab, not really necessary
                                 , binary=True # presence of a word instead of its frequency
                                 , vocabulary=set(vocab)
                                 )
    #X = vectorizer.fit_transform(tweet_list)
    return vectorizer
In [ ]:
# Vectorize the tweets to build the vocabulary.
vectorizer = vectorize_1()
X = vectorizer.fit_transform(train_tweets)
vocab = vectorizer.get_feature_names()
pickle.dump(vocab, open('./data/vocab'+str(X.get_shape()[1])+'.pkl','wb'))
print "Totals prior to applying SVD:"
print " tweets: {:,}".format(X.get_shape()[0])
print " vocabulary terms: {:,}".format(X.get_shape()[1])
In [ ]:
#explained_variances = np.var(X_svd, axis=0) / np.var(X_train, axis=0).sum()
def create_svd_doc_term_matrix(X_train, num_eigen_vectors=100):
    """
    Create the array with truncated SVD.
    """
    # Build the function that creates the SVD space
    svd = TruncatedSVD(n_components=num_eigen_vectors)
    # Apply normalization in place to each row of the data
    pipeline = make_pipeline(svd, Normalizer(copy=False))
    return pipeline.fit_transform(X_train), svd
In [ ]:
explained_variance_list = []
# The number of svd components to explore
svd_component_range = range(1,100,10)
# finds the explained variance for each number of components
for i in svd_component_range:
    # find the explained variance (i is the number of components to use)
    X_svd, svd = create_svd_doc_term_matrix(X, i)
    explained_variance_list.append(svd.explained_variance_ratio_.sum())
expVar = pd.DataFrame({'explained_var':explained_variance_list
, 'components':svd_component_range})
display(expVar)
display(expVar.plot(x='components',y='explained_var'))
Based on the table and plot above, we determine the number of SVD components to use. Typically, we want to shoot for about 90% of the variance in the data set being explained.
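If we want to make that choice programmatically, here is a minimal sketch (the helper name is hypothetical) that picks the smallest component count in our search range whose explained variance clears a chosen threshold, using the expVar data frame above:
In [ ]:
# hypothetical helper: smallest component count whose explained variance
# meets the threshold; falls back to the largest value we tried
def pick_n_components(exp_var_df, threshold=0.90):
    enough = exp_var_df[exp_var_df['explained_var'] >= threshold]
    if len(enough) > 0:
        return int(enough['components'].min())
    return int(exp_var_df['components'].max())
print pick_n_components(expVar)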
In [ ]:
n_components = 25
X_svd, svd = create_svd_doc_term_matrix(X,n_components)
pickle.dump(svd, open('./data/svd_comp'+str(n_components)+'.pkl','wb'))
X_svd.shape
In [ ]:
print "Totals prior to applying SVD:"
print " tweets: {:,}".format(X_svd.shape[0])
print " vocabulary terms: {:,}".format(X_svd.shape[1]) # it's not exactly correct to say that these are vocabulary terms since each dimension is a linear combination of many terms in SVD space, but the point is that we're redicusing the dimensionality.
In [ ]:
def build_clusters(X_svd, k=5):
    """
    Use kmeans to find centroids.
    """
    km = KMeans(n_clusters=k
                , init='k-means++'
                , max_iter=100
                #, n_init=10
                , verbose=False)
    km.fit(X_svd)
    pred = km.predict(X_svd)
    pred_df = pd.DataFrame(pred)
    pred_df.columns = ['pred_cluster']
    return km.cluster_centers_, pred_df, k, km
Choosing k, the number of clusters, can involve much more analysis than this tutorial is targeting. See @jrmontag's insightful tutorial for more details: choosing-k-in-kmeans.
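One common heuristic (not part of this tutorial's pipeline) is an "elbow" check: fit k-means for a handful of candidate k values and plot the inertia (within-cluster sum of squares), looking for the bend in the curve. A minimal sketch, assuming the X_svd matrix from above:
In [ ]:
# sketch of an "elbow" check: inertia for several candidate values of k
inertias = []
k_range = range(2, 21, 2)
for k in k_range:
    km_tmp = KMeans(n_clusters=k, init='k-means++', max_iter=100)
    km_tmp.fit(X_svd)
    inertias.append(km_tmp.inertia_)
pd.DataFrame({'k': list(k_range), 'inertia': inertias}).plot(x='k', y='inertia')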
In [ ]:
# Choose number of clusters
my_k = 10
In [ ]:
# Build centroids
centroids, predictions, n_clusters, km = build_clusters(X_svd, my_k)
pickle.dump(centroids, open('./data/centroids'+str(n_clusters)+'.pkl','wb'))
pickle.dump(predictions, open('./data/predictions'+str(n_clusters)+'.pkl','wb'))
pickle.dump(km, open('./data/km'+str(n_clusters)+'.pkl','wb'))
In [ ]:
original_space_centroids = svd.inverse_transform(centroids)
order_centroids = original_space_centroids.argsort()[:, ::-1]
In [ ]:
for i in range(my_k):
    print("Top words for cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print(' %s' % vocab[ind])
    print
If these clusters seem opaque, we might want to start manipulating the features. Feature engineering is a broad topic beyond the scope of this tutorial. One suggestion: consider using only the nouns from the tweets to build the vocabulary used in the vectorizer.
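As a rough illustration of that suggestion, the sketch below keeps only noun tokens before vectorizing. It assumes NLTK is installed along with its 'punkt' and 'averaged_perceptron_tagger' data, which this tutorial does not otherwise require:
In [ ]:
# sketch: keep only nouns (NN* part-of-speech tags) from each tweet before
# building the vocabulary; assumes NLTK plus its tokenizer and tagger data
import nltk
def nouns_only(text_str):
    tokens = nltk.word_tokenize(text_str)
    tagged = nltk.pos_tag(tokens)
    return " ".join(word for word, tag in tagged if tag.startswith('NN'))
noun_tweets = [nouns_only(t) for t in train_tweets]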
In [ ]:
class TopicModel():
    """
    Label new tweets with previously established centroids and vocabulary.
    """
    def __init__(self):
        """
        Load and initialize any external models or data here.
        """
        self.km = pickle.load(open('./data/km'+str(n_clusters)+'.pkl'))
        self.svd = pickle.load(open('./data/svd_comp'+str(n_components)+'.pkl'))
        self.vocab = pickle.load(open('./data/vocab'+str(X.get_shape()[1])+'.pkl'))
        self.stop_words = text.ENGLISH_STOP_WORDS.union(['http','https','amp'])
        self.vectorizer = CountVectorizer(
            stop_words=self.stop_words
            , binary=True # presence of a word instead of its frequency
            , vocabulary=self.vocab
            )
        self.tweets = pickle.load(open('./data/test_tweets.pkl'))

    def enrichment_value(self):
        """
        Calculates the nearest cluster for each unlabeled tweet using the vocab and cluster centers from the training set.
        """
        tweetTxt = self.tweets
        # vectorize the tweets
        X = self.vectorizer.fit_transform(tweetTxt)
        X_svd = self.svd.transform(X)
        labels = self.km.predict(X_svd)
        return labels

    def __repr__(self):
        """ Add a description of the class's function here """
        return("Tweets vectorized using CountVectorizer, including 2-grams in the vocab. \
The topic clusters are built from Twitter data from the public API, \
SVD is used to reduce dimensions and k-means to find centroids. New tweets are \
labeled by their nearness to the centroids. The result returned is the cluster label for each tweet.")
In [ ]:
# Create instance of model and label tweets
model = TopicModel()
test_data = model.enrichment_value()
type(test_data)
data={ str(k):[v] for k,v in Counter(test_data).items()}
clusterID_df = pd.DataFrame(data).transpose()
clusterID_df.columns = ['count']
display(clusterID_df.transpose())
Each time we run k-means, the results may vary somewhat. Depending on the goals of our work, we may want to develop some consistency in these results. Below are some considerations.
1.) Review the stability of the distribution of labels on the test set. If we re-run the process, does the distribution change dramatically? (A small sketch follows this list.)
2.) Review the "meaning" of the word loadings. Does the choice of k largely affect the cluster terms?
3.) Consider new features. Start broad and then use SVD to narrow your selection.
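To illustrate the first consideration, here is a minimal sketch (assuming the X_svd matrix and my_k from above) that re-fits k-means with a few different seeds and compares the sorted cluster sizes between runs:
In [ ]:
# sketch for consideration 1: re-fit k-means with different random seeds and
# compare the sorted cluster sizes; cluster IDs are not aligned across runs,
# so comparing sorted sizes is only a rough stability check
for seed in [0, 1, 2]:
    km_run = KMeans(n_clusters=my_k, init='k-means++', max_iter=100, random_state=seed)
    run_labels = km_run.fit_predict(X_svd)
    print "seed {}: {}".format(seed, sorted(Counter(run_labels).values(), reverse=True))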
As a starting point, we will simply review the clusterID distribution using three different plotting methods:
In [ ]:
# review clusterID distribution
display(clusterID_df)
clusterID_df.plot(kind='bar',title='Cluster ID Distribution',rot=0)
In [ ]:
# review clusterID distribution
output_notebook()
df = pd.DataFrame({'values': list(clusterID_df['count'].values), 'names':list(clusterID_df.transpose().columns.values)})
display(df)
bar = Bar(df, 'names', values='values', title="Cluster ID Distribution", ylabel="Count", xlabel="Cluster ID")
show(bar)
In [ ]:
# push the variable to R
df = pd.DataFrame({'values': [int(item) for item in list(clusterID_df['count'].values)], 'names':list(clusterID_df.transpose().columns.values)})
display(df)
%Rpush df
In [ ]:
%%R
ggplot(data=df) + geom_bar(stat='identity', aes(x=names, y=values), color='white', fill='blue') + ggtitle("Cluster ID Distribution") + xlab('Cluster ID') + ylab('Count')