In [ ]:
import json
import itertools
import pickle
import hickle 
import gzip
import operator
import os
import sys
from time import time
import pprint as pp
import collections
import ConfigParser
from collections import Counter
from operator import itemgetter

import numpy as np
import pandas as pd

import twitter

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.externals import joblib
from sklearn.feature_extraction import text 

# bokeh
import bokeh.plotting as bkplt
from bokeh.charts import *
from bokeh.io import output_notebook
from bokeh.charts import Histogram, show

# import requirements
from IPython.display import Image
from IPython.display import display
import matplotlib.pyplot as plt
import json
import rpy2
%load_ext rpy2.ipython
%R require("ggplot2")
%matplotlib inline
from ggplot import *
randn = np.random.randn

Setup & Creds

Let's grab some JSON records from Twitter's public search API.

We'll use python-twitter.

$ pip install python-twitter
$ pydoc twitter.Api

Build an app at https://apps.twitter.com/.

Then put the app's credentials in the config.cfg file, as sketched below.
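
A minimal config.cfg sketch (the [oauth] section and option names mirror the config.get calls in the next cell; the values are placeholders for your own app's credentials):

    [oauth]
    token = YOUR_ACCESS_TOKEN
    token_secret = YOUR_ACCESS_TOKEN_SECRET
    con_key = YOUR_CONSUMER_KEY
    con_secret_key = YOUR_CONSUMER_SECRET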


In [ ]:
# read the config file.
config = ConfigParser.RawConfigParser()
config.read('config.cfg')

# creds found in your Twitter app. See https://apps.twitter.com/
token = config.get('oauth','token')
token_secret = config.get('oauth','token_secret')
con_key = config.get('oauth','con_key')
con_secret_key = config.get('oauth','con_secret_key')

# setup 
api = twitter.Api(
    consumer_key=con_key
    , consumer_secret=con_secret_key
    , access_token_key = token
    , access_token_secret = token_secret)

# test creds
print "@{}".format(api.VerifyCredentials().GetScreenName())

In [ ]:
# get ~5,000 tweets from the public API.
total_tweets = 5000
results = api.GetSearch(term='golden retriever', count=100, include_entities=True)
tweets = list(results)  # keep the first batch
while len(tweets) <= total_tweets and results:
    # page backwards from the oldest id collected so far
    results = api.GetSearch(term='golden retriever'
                            , count=100
                            , max_id=results[-1].GetId()
                            , include_entities=True)
    tweets.extend(results)
# store the tweet text
tweet_text = [tweet.GetText() for tweet in tweets]
print len(tweet_text)
pickle.dump(tweet_text, open('./data/tweet_text.pkl', 'wb'))

Train/Test set

Split the training and test set.


In [ ]:
# Set up a training and test set.

def create_index(total_tweets):
    """
    Builds an index for the training and test set.
    The sets serve as a list of row numbers to extract from the dataset.
    """
    # based on the total tweet count, create an array of all line numbers
    line_index = np.arange(total_tweets)
    # split the array into training and test sets of index values
    trainIndex, testIndex = train_test_split(line_index, train_size=0.70, random_state=42)
    # save test & training index values
    #np.save("training_index",trainIndex)
    #np.save("testing_index",testIndex)
    return trainIndex, testIndex

# build indices
trainIndex,testIndex = create_index(len(tweet_text))
pickle.dump(trainIndex,open('data/trainIndex.pkl','wb'))
pickle.dump(testIndex,open('data/testIndex.pkl','wb'))

# build test set
test_tweets = [tweet_text[i] for i in testIndex]
pickle.dump(test_tweets,open('data/test_tweets.pkl','wb'))

train_tweets = [tweet_text[i] for i in trainIndex]
pickle.dump(train_tweets,open('data/train_tweets.pkl','wb'))
print "train: {:,}".format(len(train_tweets))
print "test: {:,}".format(len(test_tweets))

Vectorize the Tweets

Two steps:

  1. Set up a vectorizer.
  2. Vectorize the tweets to build the vocabulary.

In [ ]:
# Set up a vectorizer.
# see http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

def vectorize_1():
    vectorizer = TfidfVectorizer(#min_df=20
                                 stop_words='english'
                                 #, sublinear_tf=True
                                 , use_idf=True # enable inverse-document-frequency reweighting
                                 , ngram_range=(1,2) # given our vocab, not really necessary
                                 , binary = True # presence of word instead of frequency
                                 #, vocabulary = vocab
                                ) 
    #X = vectorizer.fit_transform(tweet_list)
    return vectorizer

def vectorize_2(vocab):
    vectorizer = CountVectorizer(stop_words='english'
                                 , ngram_range=(1,2) # given our vocab, not really necessary
                                 , binary = True # presence of word instead of frequency
                                 , vocabulary = set(vocab)
                                ) 
    #X = vectorizer.fit_transform(tweet_list)
    return vectorizer

In [ ]:
# Vectorize the tweets to build the vocabulary.
vectorizer = vectorize_1()
X = vectorizer.fit_transform(train_tweets)
vocab = vectorizer.get_feature_names()
pickle.dump(vocab, open('./data/vocab'+str(X.get_shape()[1])+'.pkl','wb')) 
print "Totals prior to applying SVD:"
print " tweets: {:,}".format(X.get_shape()[0])
print " vocabulary terms: {:,}".format(X.get_shape()[1])

Dimension Reduction

To choose an appropriate number of SVD components, we need to explore how much variance is explained as components are added. We'll keep roughly the number of components that together explain about 90% of the variance.


In [ ]:
def create_svd_doc_term_matrix(X_train, num_eigen_vectors=100):
    """
    Create the array with truncated svd.
    """
    # Build the function that creates the SVD space
    svd = TruncatedSVD(n_components = num_eigen_vectors)
    # Apply normalization in place to each row of the data 
    pipeline = make_pipeline(svd, Normalizer(copy=False))
    return pipeline.fit_transform(X_train), svd

In [ ]:
explained_variance_list = []
# The number of svd components to explore
svd_component_range = range(1,100,10)

# finds the explained variance for each number of components
for i in svd_component_range:
    # find explained variance (i in this case is the number of components to use)
    X_svd, svd = create_svd_doc_term_matrix(X,i)
    explained_variance_list.append(svd.explained_variance_ratio_.sum())

expVar = pd.DataFrame({'explained_var':explained_variance_list
                   , 'components':svd_component_range})
display(expVar)

display(expVar.plot(x='components',y='explained_var'))

Based on the above graph, we determine the number of SVD components to use. Typically, we aim for the components to explain about 90% of the variance in the data set.
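
To automate that eyeball check, here is a minimal sketch (an addition to the original workflow) that picks the smallest component count in the sweep whose cumulative explained variance crosses the 90% target, falling back to the largest count tried if nothing does:

    # sketch: smallest swept component count reaching ~90% cumulative explained variance
    target = 0.90
    over_target = expVar[expVar['explained_var'] >= target]
    if len(over_target) > 0:
        suggested_n = int(over_target['components'].min())
    else:
        # nothing in the sweep reaches the target; fall back to the largest count tried
        suggested_n = int(expVar['components'].max())
    print "suggested n_components: {}".format(suggested_n)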


In [ ]:
n_components = 25
X_svd, svd = create_svd_doc_term_matrix(X,n_components)
pickle.dump(svd, open('./data/svd_comp'+str(n_components)+'.pkl','wb')) 
X_svd.shape

In [ ]:
print "Totals prior to applying SVD:"
print " tweets: {:,}".format(X_svd.shape[0])
print " vocabulary terms: {:,}".format(X_svd.shape[1]) # it's not exactly correct to say that these are vocabulary terms since each dimension is a linear combination of many terms in SVD space, but the point is that we're redicusing the dimensionality.

Create Cluster Centroids

We'll now apply kmeans to find the centroids that will be used to predict a cluster for each tweet.


In [ ]:
def build_clusters(X_svd, k=5):
    """
    Use kmeans to find centroids.
    """
    km = KMeans(n_clusters=k
                , init='k-means++'
                , max_iter=100
                #, n_init=10
                , verbose=False)
    km.fit(X_svd)
    pred=km.predict(X_svd)
    pred_df=pd.DataFrame(pred)
    pred_df.columns=['pred_cluster']
    return km.cluster_centers_ , pred_df, k, km

Choosing k, the number of clusters, can involve much more analysis than this tutorial targets. See @jrmontag's insightful tutorial for more details: choosing-k-in-kmeans.
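
As a lightweight starting point (a sketch, assuming within-cluster inertia is an acceptable proxy for cluster quality here), we can sweep k and look for an "elbow" in the k-means inertia:

    # sketch: sweep k and plot the k-means inertia to eyeball an elbow
    inertias = []
    k_candidates = range(2, 21)
    for k in k_candidates:
        km_k = KMeans(n_clusters=k, init='k-means++', max_iter=100, random_state=42)
        km_k.fit(X_svd)
        inertias.append(km_k.inertia_)
    pd.DataFrame({'k': list(k_candidates), 'inertia': inertias}).plot(x='k', y='inertia')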


In [ ]:
# Choose number of clusters
my_k = 10

In [ ]:
# Build centroids
centroids, predictions, n_clusters, km = build_clusters(X_svd, my_k)
pickle.dump(centroids, open('./data/centroids'+str(n_clusters)+'.pkl','wb'))
pickle.dump(predictions, open('./data/predictions'+str(n_clusters)+'.pkl','wb')) 
pickle.dump(km, open('./data/km'+str(n_clusters)+'.pkl','wb'))

Explore Word Loadings

The terms with the largest loadings on each cluster center serve as an approximation of that cluster's meaning.


In [ ]:
original_space_centroids = svd.inverse_transform(centroids)
order_centroids = original_space_centroids.argsort()[:, ::-1]

In [ ]:
for i in range(my_k):
    print("Top words for cluster %d:" % i)
    for ind in order_centroids[i, :20]:
        print(' %s' % vocab[ind])
    print

If these clusters seem opaque, we might want to start manipulating the features. Feature engineering is a broad topic beyond the scope of this tutorial. One suggestion: consider using only the nouns from the tweets to build the vocabulary used in the vectorizer, as sketched below.
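
A rough sketch of that nouns-only idea (it assumes NLTK and its tokenizer/POS-tagger models are installed; NLTK is not used elsewhere in this notebook):

    import nltk  # assumed extra dependency; needs the 'punkt' and POS-tagger models downloaded

    def nouns_only(tweet):
        # keep only tokens tagged as nouns (NN, NNS, NNP, NNPS)
        tagged = nltk.pos_tag(nltk.word_tokenize(tweet))
        return " ".join(word for word, tag in tagged if tag.startswith('NN'))

    noun_tweets = [nouns_only(t) for t in train_tweets]
    # noun_tweets could then replace train_tweets when fitting the vectorizer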

Label New Tweets

Apply the model to the test set.


In [ ]:
class TopicModel():
    """
    Label new tweets w/ previously established centroids and vocabulary.
    """
    def __init__(self):
        """
        Load and initialize any external models or data here.
        """
        self.km = pickle.load(open('./data/km'+str(n_clusters)+'.pkl','rb'))
        self.svd = pickle.load(open('./data/svd_comp'+str(n_components)+'.pkl','rb'))
        self.vocab = pickle.load(open('./data/vocab'+str(X.get_shape()[1])+'.pkl','rb'))
        self.stop_words = text.ENGLISH_STOP_WORDS.union(['http','https','amp'])
        self.vectorizer = CountVectorizer(
                stop_words=self.stop_words
                , ngram_range=(1,2) # the saved vocabulary contains bigrams, so generate them here too
                , binary = True # presence of word instead of frequency
                , vocabulary = self.vocab
                )
        self.tweets = pickle.load(open('./data/test_tweets.pkl','rb'))
    def enrichment_value(self):
        """
        Calculates the nearest cluster for an unlabeled tweet using the vocab and cluster centers from the training set.
        """
        tweetTxt = self.tweets
        # vectorize the tweet text against the training vocabulary
        X = self.vectorizer.fit_transform(tweetTxt)
        # project into SVD space and L2-normalize, matching the training pipeline
        X_svd = Normalizer(copy=False).fit_transform(self.svd.transform(X))
        labels = self.km.predict(X_svd)
        return labels
    def __repr__(self):
        """ Add a description of the class's function here """
        return("Tweets vectorized using CountVectorizer with a vocabulary that includes \
                bigrams. The topic clusters are built from Twitter public API data; SVD \
                reduces the dimensionality and kmeans supplies the centroids. New tweets \
                are labeled by their nearest centroid, and a cluster label is returned for each.")

In [ ]:
# Create instance of model and label tweets
model = TopicModel()
test_data = model.enrichment_value()
type(test_data)
data={ str(k):[v] for k,v in Counter(test_data).items()}
clusterID_df = pd.DataFrame(data).transpose()
clusterID_df.columns = ['count']
display(clusterID_df.transpose())

Review the Results

Each time we apply kmeans, we may see some variation in the results. Depending on the goals of our work, we may want to develop some consistency in those results. Below are some considerations.

  1. Review the stability of the distribution of labels on the test set. If we re-run the process, does the distribution change dramatically? (See the sketch after this list.)
  2. Review the "meaning" of the word loadings. Does the choice of k strongly affect the cluster terms?
  3. Consider new features. Start broad and then use SVD to narrow your selection.
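
A sketch of check 1 (added for illustration): refit kmeans with a few different seeds on the same data and compare the sorted cluster-size distributions.

    # sketch: compare cluster-size distributions across re-runs with different seeds
    for seed in (1, 2, 3):
        km_seed = KMeans(n_clusters=my_k, init='k-means++', max_iter=100, random_state=seed)
        labels_seed = km_seed.fit_predict(X_svd)
        print sorted(Counter(labels_seed).values(), reverse=True)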

As a starting point, we will simply review the clusterID distribution using three different plotting methods:

  • matplotlib
  • BokehJS
  • R

In [ ]:
# review clusterID distribution 
display(clusterID_df)
clusterID_df.plot(kind='bar',title='Cluster ID Distribution',rot=0)

In [ ]:
# review clusterID distribution 
output_notebook()
df = pd.DataFrame({'values': list(clusterID_df['count'].values), 'names':list(clusterID_df.transpose().columns.values)})
display(df)
bar = Bar(df, 'names', values='values', title="Cluster ID Distribution", ylabel="Count", xlabel="Cluster ID")
show(bar)

In [ ]:
# push the variable to R
df = pd.DataFrame({'values': [int(item) for item in list(clusterID_df['count'].values)], 'names':list(clusterID_df.transpose().columns.values)})
display(df)
%Rpush df

In [ ]:
%%R 
ggplot(data=df) + geom_bar(stat='identity', aes(x=names, y=values), color='white', fill='blue') + ggtitle("Cluster ID Distribution") + xlab('Cluster ID') + ylab('Count')